diff --git a/.clang-format b/.clang-format index 4311596850..3e49ddce6e 100644 --- a/.clang-format +++ b/.clang-format @@ -8,6 +8,7 @@ AlignEscapedNewlines: true AlignOperands: Align AllowShortIfStatementsOnASingleLine: AllIfsAndElse ColumnLimit: 150 +PenaltyBreakOpenParenthesis: 100 ReflowComments: false CommentPragmas: 'TESTARGS' DerivePointerAlignment: false diff --git a/.clang-tidy b/.clang-tidy index ab45c266bf..04cd208737 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,3 +1,3 @@ -Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name,bugprone-too-small-loop-variable" +Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name,bugprone-too-small-loop-variable,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling" HeaderFilterRegex: .* WarningsAsErrors: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..9c6f967319 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,12 @@ +Purpose: + +Describe the purpose of the PR here. + +Closes: #ISSUE_NUMBER + +LLM/GenAI Disclosure: + +Describe any LLM and GenAI usage here. + +By submitting this PR, the author certifies to its contents as described by the [Developer's Certificate of Origin](https://developercertificate.org/). +Please follow the [Contributing Guidelines](https://github.com/CEED/libCEED/blob/main/CONTRIBUTING.md) for all PRs. 
diff --git a/.github/workflows/c-fortran-test-hardware.yml b/.github/workflows/c-fortran-test-ppc64le.yml similarity index 72% rename from .github/workflows/c-fortran-test-hardware.yml rename to .github/workflows/c-fortran-test-ppc64le.yml index 7dd7626ebf..f710c9ba12 100644 --- a/.github/workflows/c-fortran-test-hardware.yml +++ b/.github/workflows/c-fortran-test-ppc64le.yml @@ -1,4 +1,4 @@ -name: ARM and IBM Power +name: IBM Power on: push: @@ -10,9 +10,9 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] - compiler: [gcc-13] - arch: [aarch64, ppc64le] + os: [ubuntu-24.04] + compiler: [gcc] + arch: [ppc64le] distro: [ubuntu22.04] runs-on: ${{ matrix.os }} @@ -21,10 +21,10 @@ jobs: - name: Environment setup uses: actions/checkout@v4 - name: Hardware setup and test libCEED - uses: uraimo/run-on-arch-action@v2 + uses: uraimo/run-on-arch-action@v3 env: CC: ${{ matrix.compiler }} - FC: gfortran-13 + FC: gfortran id: runcmd with: arch: ${{ matrix.arch }} @@ -36,5 +36,5 @@ jobs: apt-get install -y python3 uname -a make info - make -j2 - PROVE_OPTS=-v make prove -j2 + make -j + make prove -j search="t5 ex" diff --git a/.github/workflows/c-fortran-test-arm64.yml b/.github/workflows/c-fortran-test-arm64.yml new file mode 100644 index 0000000000..6927f37b68 --- /dev/null +++ b/.github/workflows/c-fortran-test-arm64.yml @@ -0,0 +1,28 @@ +name: ARM + +on: + push: + branches: + - main + pull_request: + +jobs: + test: + strategy: + matrix: + os: [ubuntu-24.04-arm] + compiler: [gcc, clang] + + runs-on: ${{ matrix.os }} + + steps: + - name: Environment setup + uses: actions/checkout@v4 + - name: Build and test libCEED + env: + CC: ${{ matrix.compiler }} + FC: gfortran + run: | + make info + make -j + make prove -j diff --git a/.github/workflows/c-fortran-test-icc.yml b/.github/workflows/c-fortran-test-icc.yml index fc5f3407cd..4e854195b1 100644 --- a/.github/workflows/c-fortran-test-icc.yml +++ b/.github/workflows/c-fortran-test-icc.yml @@ -14,7 +14,7 @@ jobs: test: strategy:
matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] runs-on: ${{ matrix.os }} @@ -32,6 +32,6 @@ jobs: export CC=icx CXX=icx FC=ifx export OPENMP=1 make info - make -j2 - PROVE_OPTS=-v make prove -j2 + make -j + make prove -j diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml index 806cbcc16d..52df23c8d1 100644 --- a/.github/workflows/c-fortran-test-linux-osx.yml +++ b/.github/workflows/c-fortran-test-linux-osx.yml @@ -10,19 +10,44 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04, macos-13] - compiler: [gcc-13, clang] + os: [ubuntu-24.04, macos-15] + compiler: [gcc, clang] + include: + - os: macos-15 + compiler: apple-clang runs-on: ${{ matrix.os }} steps: - name: Environment setup uses: actions/checkout@v4 + - name: Set compiler + run: | + case "${{ matrix.compiler }}" in + gcc) + if [[ "${{ matrix.os }}" == macos-* ]]; then + echo "CC=gcc-15" >> $GITHUB_ENV + else + echo "CC=gcc" >> $GITHUB_ENV + fi + ;; + clang) + if [[ "${{ matrix.os }}" == macos-* ]]; then + echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV + else + echo "CC=clang" >> $GITHUB_ENV + fi + ;; + apple-clang) + echo "CC=clang" >> $GITHUB_ENV + ;; + esac + - name: Show compiler version + run: $CC --version | head -1 - name: Build and test libCEED env: - CC: ${{ matrix.compiler }} - FC: gfortran-13 + FC: gfortran-14 run: | make info - make -j2 - PROVE_OPTS=-v make prove -j2 + make -j + make prove -j2 diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml index 4f2fcb4054..ff55101bde 100644 --- a/.github/workflows/c-fortran-test-style.yml +++ b/.github/workflows/c-fortran-test-style.yml @@ -10,7 +10,7 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] compiler: [clang] runs-on: ${{ matrix.os }} @@ -21,12 +21,12 @@ jobs: - name: Install clang-format run: | wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - - sudo add-apt-repository 'deb 
http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-17 main' - sudo apt update && sudo apt install clang-format-17 + sudo add-apt-repository 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main' + sudo apt update && sudo apt install clang-format-19 - name: C style env: CC: ${{ matrix.compiler }} - FC: gfortran-11 + FC: gfortran run: | make info - make format-c -j2 CLANG_FORMAT=clang-format-17 && git diff --exit-code + make format-c -j CLANG_FORMAT=clang-format-19 && git diff --exit-code diff --git a/.github/workflows/julia-documentation.yml b/.github/workflows/julia-documentation.yml index d7a432426f..b90bb1bb1e 100644 --- a/.github/workflows/julia-documentation.yml +++ b/.github/workflows/julia-documentation.yml @@ -9,7 +9,7 @@ on: jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest diff --git a/.github/workflows/julia-test-with-style.yml b/.github/workflows/julia-test-with-style.yml index b74434ff49..a292c9550b 100644 --- a/.github/workflows/julia-test-with-style.yml +++ b/.github/workflows/julia-test-with-style.yml @@ -10,7 +10,7 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] julia-version: ['1'] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/python-test-with-style.yml b/.github/workflows/python-test-with-style.yml index a8fc0af33c..4c2764b244 100644 --- a/.github/workflows/python-test-with-style.yml +++ b/.github/workflows/python-test-with-style.yml @@ -10,8 +10,8 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] - compiler: [gcc-13] + os: [ubuntu-24.04] + compiler: [gcc] python-version: ['3.x'] runs-on: ${{ matrix.os }} @@ -30,7 +30,7 @@ jobs: - name: Python test env: CC: ${{ matrix.compiler }} - FC: gfortran-13 + FC: gfortran run: | make info make -j2 @@ -38,16 +38,18 @@ jobs: pip install . 
cd python/tests PYTHON=python3 make test TEST_OPTS="--ceed /cpu/self/ref/serial -vv" + cd ../../examples/python + PYTHON=python3 make test TEST_OPTS="--ceed /cpu/self/ref/serial -vv" cd ../.. - name: Python style env: CC: ${{ matrix.compiler }} - FC: gfortran-13 + FC: gfortran run: | make format-py && git diff --exit-code - name: Python version env: CC: ${{ matrix.compiler }} - FC: gfortran-13 + FC: gfortran run: | make vermin diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml index 8d90b2490d..a4fa213618 100644 --- a/.github/workflows/release-notes.yml +++ b/.github/workflows/release-notes.yml @@ -10,7 +10,7 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/rust-documentation.yml b/.github/workflows/rust-documentation.yml index b0ca00c440..4d6410548a 100644 --- a/.github/workflows/rust-documentation.yml +++ b/.github/workflows/rust-documentation.yml @@ -10,7 +10,7 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] compiler: [clang] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/rust-test-with-style.yml b/.github/workflows/rust-test-with-style.yml index 0626ecb989..63ce2d7b3c 100644 --- a/.github/workflows/rust-test-with-style.yml +++ b/.github/workflows/rust-test-with-style.yml @@ -10,7 +10,7 @@ jobs: test: strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04, macos-15] compiler: [clang] runs-on: ${{ matrix.os }} @@ -31,8 +31,8 @@ jobs: - name: Rust test with coverage env: CC: ${{ matrix.compiler }} - FC: gfortran-11 - run: cargo llvm-cov test --doctests --lcov --output-path lcov.info + FC: gfortran + run: CARGO_CEED_OPT_FLAGS="-g -O0 -fno-inline" cargo llvm-cov test --doctests --lcov --output-path lcov.info - name: Codecov upload uses: codecov/codecov-action@v4 with: @@ -42,7 +42,7 @@ jobs: style: strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] compiler: [clang] runs-on: ${{ matrix.os }} diff 
--git a/.gitignore b/.gitignore index 7e7115b20e..0cb9d41a69 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ lib/* # General *.o *.so +*.so.* *.d *.DIR ceed.pc @@ -50,6 +51,12 @@ doc/sphinx/build/ # Example docs automatically copied from source tree doc/sphinx/source/examples/ +# Clang GPU temp files +temp/* + +# Nek5K +SESSION.NAME + # Output files, videos, and compressed archives should not be added accidentally *.avi *.bin @@ -91,3 +98,10 @@ libCEED.includes *.aux *.fdb_latexmk *.fls + +# profiling files +*.txt +*.proto +*.csv + +.venv diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 58f7e93f59..d3188148a5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,6 +5,10 @@ stages: - test:cpu-and-tidy - test:gpu-and-float +workflow: + auto_cancel: + on_job_failure: all + # ---------------------------------------------------------------------------------------- # Memcheck backends + ASAN @@ -15,36 +19,37 @@ noether-asan: - cpu interruptible: true before_script: -# Environment -# Note: COVERAGE=0 is needed when using ASAN + # Environment + # Note: COVERAGE=0 is needed when using ASAN - export COVERAGE=0 CC=gcc CXX=g++ FC=gfortran - export NPROC_POOL=8 - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU - echo "-------------- CC ------------------" && $CC --version - echo "-------------- CXX -----------------" && $CXX --version - echo "-------------- FC ------------------" && $FC --version -# ASAN + # ASAN - echo "-------------- ASAN ----------------" - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak" - echo $AFLAGS script: - rm -f .SUCCESS -# libCEED - - make configure OPT='-O -march=native -ffp-contract=fast' + # libCEED + - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') - echo "-------------- libCEED -------------" && make info 
- echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU + - make clean - make -j$NPROC_CPU -# -- libCEED only tests + # -- libCEED only tests - echo "-------------- core tests ----------" - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json -# Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests + # Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests - export PETSC_DIR= PETSC_ARCH= - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="memcheck" junit realsearch=% -# Clang-tidy + # Clang-tidy - echo "-------------- clang-tidy ----------" && clang-tidy --version - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code -# Report status + # Report status - touch .SUCCESS artifacts: paths: @@ -63,7 +68,7 @@ noether-cpu: - cpu interruptible: true before_script: -# Environment + # Environment - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran - export NPROC_POOL=8 - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU @@ -71,55 +76,52 @@ noether-cpu: - echo "-------------- CXX -----------------" && $CXX --version - echo "-------------- FC ------------------" && $FC --version - echo "-------------- GCOV ----------------" && gcov --version -# Libraries for backends -# -- LIBXSMM 2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 - - cd .. 
&& export XSMM_HASH=2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED + # Libraries for backends + # -- LIBXSMM 7 April 2024 + - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR -# -- OCCA v1.6.0 - - cd .. && export OCCA_VERSION=occa-1.6.0 && { [[ -d $OCCA_VERSION ]] || { git clone --depth 1 --branch v1.6.0 https://github.com/libocca/occa.git $OCCA_VERSION && cd $OCCA_VERSION && export ENABLE_OPENCL="OFF" ENABLE_DPCPP="OFF" ENABLE_HIP="OFF" ENABLE_CUDA="OFF" && ./configure-cmake.sh && cmake --build build --parallel $NPROC_CPU && cmake --install build && cd ..; }; } && export OCCA_DIR=$PWD/$OCCA_VERSION/install && cd libCEED - - echo "-------------- OCCA ----------------" && git -C $OCCA_DIR describe --tags && LD_LIBRARY_PATH=$OCCA_DIR/lib $OCCA_DIR/bin/occa info script: - rm -f .SUCCESS -# libCEED - - make configure OPT='-O -march=native -ffp-contract=fast' + # libCEED + - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') - echo "-------------- libCEED -------------" && make info - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU - - OCCA_DIR= PEDANTIC=1 make -j$NPROC_CPU + - make clean + - PEDANTIC=1 make -j$NPROC_CPU - make -j$NPROC_CPU -# -- libCEED only tests + # -- libCEED only tests - echo "-------------- core tests ----------" - echo 
'[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json -# Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests + # Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests - export PETSC_DIR= PETSC_ARCH= - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit realsearch=% -# Libraries for examples -# -- PETSc with HIP (minimal) - - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe - - source /home/jawr8143/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/jawr8143/SmartSimTestingSoftware/smartredis/install + # Libraries for examples + # -- PETSc (minimal) + - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cpu-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info - - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids solids" -# -- MFEM v4.6 - - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED + - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids" + # -- MFEM v4.7 + - cd .. 
&& export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=mfem -# -- Nek5000 v19.0 + # -- Nek5000 v19.0 - export COVERAGE=0 - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags - export NPROC_POOL=1 - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=nek NEK5K_DIR=$NEK5K_DIR -# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11 - - OCCA_DIR= BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') + # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11 + - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR -# Report status + # Report status - touch .SUCCESS after_script: - | if [ -f .SUCCESS ]; then - lcov --directory . --capture --output-file coverage.info; + lcov --directory . 
--capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g'; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends; @@ -137,29 +139,88 @@ noether-cpu: # ---------------------------------------------------------------------------------------- # Check SYCL backends build # ---------------------------------------------------------------------------------------- -noether-sycl: + +# SYCL tests currently disabled + +#noether-sycl: +# stage: test:gpu-and-float +# tags: +# - sycl +# interruptible: true +# before_script: +# # Environment +# - . /opt/intel/oneapi/setvars.sh +# - export COVERAGE=1 CC=icx CXX=icpx +# - export NPROC_POOL=8 +# - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU +# - echo "-------------- CC ------------------" && $CC --version +# - echo "-------------- CXX -----------------" && $CXX --version +# script: +# - rm -f .SUCCESS +# # libCEED +# - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' +# - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ') +# - echo "-------------- libCEED -------------" && make info +# - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL +# - make clean +# - make -j$NPROC_CPU +# # Report status +# - touch .SUCCESS + + +# ---------------------------------------------------------------------------------------- +# Rust + CUDA +# ---------------------------------------------------------------------------------------- +noether-rust-qfunctions: stage: test:gpu-and-float tags: - - sycl + - cuda interruptible: true 
before_script: -# Environment - - . /opt/intel/oneapi/setvars.sh - - export COVERAGE=1 CC=icx CXX=icpx - - export NPROC_POOL=8 + # Environment + - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc GPU_CLANG=1 + - export NPROC_POOL=1 - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU - echo "-------------- CC ------------------" && $CC --version - echo "-------------- CXX -----------------" && $CXX --version + - echo "-------------- FC ------------------" && $FC --version + - echo "-------------- NVCC ----------------" && $NVCC --version + - echo "-------------- Rustc ---------------" && rustc --version + - echo "-------------- Clang++ -------------" && clang++ --version + - echo "-------------- GCOV ----------------" && gcov --version script: - rm -f .SUCCESS -# libCEED - - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-O -march=native -ffp-contract=fast' - - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ') + # Rustup + - rustup update nightly + - rustup component add rust-src --toolchain nightly + - rustup component add llvm-tools --toolchain nightly + # libCEED + - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9 - echo "-------------- libCEED -------------" && make info - - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL - - make -j$NPROC_CPU -# Report status + - make clean + - make -k -j$NPROC_CPU -l$NPROC_CPU + # -- libCEED only tests + - echo "-------------- Rust QFunction tests -----" + # Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests + - export PETSC_DIR= PETSC_ARCH= + - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="rust-qfunction" junit search=rustqfunction + # Report status - touch .SUCCESS + after_script: + - | + if [ -f .SUCCESS ]; then + lcov --directory . 
--capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g'; + bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface; + bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery; + bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends; + bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests; + bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples; + fi + artifacts: + paths: + - build/*.junit + reports: + junit: build/*.junit # ---------------------------------------------------------------------------------------- @@ -171,7 +232,7 @@ noether-cuda: - cuda interruptible: true before_script: -# Environment + # Environment - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc - export NPROC_POOL=4 - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU @@ -180,34 +241,56 @@ noether-cuda: - echo "-------------- FC ------------------" && $FC --version - echo "-------------- NVCC ----------------" && $NVCC --version - echo "-------------- GCOV ----------------" && gcov --version + # ASAN + - echo "-------------- ASAN ----------------" + - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak" ASAN_OPTIONS=protect_shadow_gap=0 + - echo $AFLAGS script: - rm -f .SUCCESS -# libCEED - - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr + # libCEED + - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9 - echo "-------------- libCEED -------------" && make info - BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ') - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU + - make clean - PEDANTIC=1 make 
-k -j$NPROC_CPU -l$NPROC_CPU -# -- libCEED only tests + # -- libCEED only tests - echo "-------------- core tests ----------" -# Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests + # Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests - export PETSC_DIR= PETSC_ARCH= - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit realsearch=% -# Libraries for examples -# -- PETSc with CUDA (minimal) + # Rebuild without ASAN + - unset ASAN AFLAGS ASAN_OPTIONS + - make clean + - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU + # Libraries for examples + # -- PETSc with CUDA (minimal) - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe - - source /home/jawr8143/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/jawr8143/SmartSimTestingSoftware/smartredis/install - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info - - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids solids" -# Clang-tidy + - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search="petsc fluids solids" + # -- MFEM v4.7 + - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED + - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info + - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=mfem + # -- Nek5000 v19.0 + - export COVERAGE=0 + - cd .. 
&& export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED + - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags + - export NPROC_POOL=1 + - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=nek NEK5K_DIR=$NEK5K_DIR + # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11 + - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install + - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always + - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR + # Clang-tidy - echo "-------------- clang-tidy ----------" && clang-tidy --version - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code -# Report status + # Report status - touch .SUCCESS after_script: - | if [ -f .SUCCESS ]; then - lcov --directory . --capture --output-file coverage.info; + lcov --directory . 
--capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g'; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends; @@ -224,13 +307,78 @@ noether-cuda: # ---------------------------------------------------------------------------------------- # ROCm backends # ---------------------------------------------------------------------------------------- + +# ROCm test execution currently disabled + +#noether-rocm: +# stage: test:gpu-and-float +# tags: +# - rocm +# interruptible: true +# before_script: +# # Environment +# - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc +# - export NPROC_POOL=4 +# - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU +# - echo "-------------- CC ------------------" && $CC --version +# - echo "-------------- CXX -----------------" && $CXX --version +# - echo "-------------- FC ------------------" && $FC --version +# - echo "-------------- HIPCC ---------------" && $HIPCC --version +# - echo "-------------- GCOV ----------------" && gcov --version +# # Libraries for backends +# # -- MAGMA from dev branch +# - echo "-------------- MAGMA ---------------" +# - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe +# script: +# - rm -f .SUCCESS +# # libCEED +# - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' +# - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ') +# - echo "-------------- libCEED -------------" && make info +# - echo 
"-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU +# - make clean +# - make -j$NPROC_CPU +# # -- libCEED only tests +# - echo "-------------- core tests ----------" +# - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json +# # Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests +# - export PETSC_DIR= PETSC_ARCH= +# - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=% +# # Libraries for examples +# # -- PETSc with HIP (minimal) +# - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe +# - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info +# - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids" +# # Clang-tidy +# - echo "-------------- clang-tidy ----------" && clang-tidy --version +# - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code +# # Report status +# - touch .SUCCESS +# after_script: +# - | +# if [ -f .SUCCESS ]; then +# lcov --directory . 
--capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g'; +# bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface; +# bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery; +# bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends; +# bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests; +# bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples; +# fi +# artifacts: +# paths: +# - build/*.junit +# reports: +# junit: build/*.junit +# performance: performance.json + + noether-rocm: stage: test:gpu-and-float tags: - rocm interruptible: true before_script: -# Environment + # Environment - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc - export NPROC_POOL=4 - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU @@ -239,117 +387,80 @@ noether-rocm: - echo "-------------- FC ------------------" && $FC --version - echo "-------------- HIPCC ---------------" && $HIPCC --version - echo "-------------- GCOV ----------------" && gcov --version -# Libraries for backends -# -- MAGMA from dev branch + # Libraries for backends + # -- MAGMA from dev branch - echo "-------------- MAGMA ---------------" - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe script: - rm -f .SUCCESS -# libCEED - - make configure ROCM_DIR=/opt/rocm-5.6.0 OPT='-O -march=native -ffp-contract=fast' + # libCEED + - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ') - echo 
"-------------- libCEED -------------" && make info - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU + - make clean - make -j$NPROC_CPU -# -- libCEED only tests - - echo "-------------- core tests ----------" - - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json -# Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests - - export PETSC_DIR= PETSC_ARCH= - - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=% -# Libraries for examples -# -- PETSc with HIP (minimal) - - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe - - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info - - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids" -# -- MFEM v4.6 - - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED - - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info - - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=mfem -# -- Nek5000 v19.0 - - export COVERAGE=0 - - cd .. 
&& export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED - - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags - - export NPROC_POOL=1 - - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=nek NEK5K_DIR=$NEK5K_DIR -# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11 - - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install - - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always - - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR -# Clang-tidy + # Clang-tidy - echo "-------------- clang-tidy ----------" && clang-tidy --version + - make clean - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code -# Report status + # Report status - touch .SUCCESS - after_script: - - | - if [ -f .SUCCESS ]; then - lcov --directory . 
--capture --output-file coverage.info; - bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface; - bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery; - bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends; - bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests; - bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples; - fi - artifacts: - paths: - - build/*.junit - reports: - junit: build/*.junit - performance: performance.json # ---------------------------------------------------------------------------------------- -# CPU + ROCm backends with CeedScalar == float (32 bit) +# CPU + CUDA backends with CeedScalar == float (32 bit) # ---------------------------------------------------------------------------------------- noether-float: stage: test:gpu-and-float tags: - cpu - - rocm + - cuda interruptible: true before_script: -# Environment - - export COVERAGE=1 CC=gcc CXX=g++ FC= HIPCC=hipcc + # Environment + - export COVERAGE=1 CC=gcc CXX=g++ FC= NVCC=nvcc - export NPROC_POOL=8 - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU - echo "-------------- CC ------------------" && $CC --version - echo "-------------- CXX -----------------" && $CXX --version - - echo "-------------- HIPCC ---------------" && $HIPCC --version + - echo "-------------- NVCC ----------------" && $NVCC --version - echo "-------------- GCOV ----------------" && gcov --version -# Libraries for backends + # Libraries for backends +# ROCm tests currently disabled # -- MAGMA from dev branch - - echo "-------------- MAGMA ---------------" - - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe -# -- LIBXSMM 
2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 - - cd .. && export XSMM_HASH=2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED +# - echo "-------------- MAGMA ---------------" +# - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe + # -- LIBXSMM 19 March 2025 + - cd .. && export XSMM_HASH=ba9d6bc69c421c10f0597d582ea1ace6a6126308 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR script: - rm -f .SUCCESS -# libCEED -# Change to single precision + # libCEED + # Change to single precision - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h -# Build libCEED - - make configure ROCM_DIR=/opt/rocm-5.6.0 OPT='-O -march=native -ffp-contract=fast' + # Build libCEED + - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9 - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ') - echo "-------------- libCEED -------------" && make info - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU + - make clean - make -j$NPROC_CPU -# -- libCEED only tests + # -- libCEED only tests - echo "-------------- core tests ----------" - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed 
Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json -# Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests + # Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests - export PETSC_DIR= PETSC_ARCH= - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="float-cpu" junit realsearch=% - export NPROC_POOL=4 - - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-hip" junit realsearch=% -# Report status + - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-cuda" junit realsearch=% + # Report status - echo "SUCCESS" > .job_status after_script: - | if [ $(cat .job_status) == "SUCCESS" ]; then - lcov --directory . --capture --output-file coverage.info; + lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g'; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery; bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends; diff --git a/.mailmap b/.mailmap index 61c3cab2b6..787aaa2c49 100644 --- a/.mailmap +++ b/.mailmap @@ -6,7 +6,11 @@ # # See 'git help shortlog' for details +Adeleke Bankole +Adeleke Bankole <86932837+AdelekeBankole@users.noreply.github.com> Ahmad Abdelfattah <36712794+abdelfattah83@users.noreply.github.com> +Allen MacFarland <79958059+SirAlienTheGreat@users.noreply.github.com> +Alex Pedersen <54287657+ajpedersen20@users.noreply.github.com> Arash Mehraban David Medina James Wright @@ -16,13 +20,21 @@ Jeremy L. Thompson <25011573+jeremylt@users Jeremy L. Thompson Jeremy L. Thompson Jeremy L. 
Thompson -Leila Ghaffari <49916147+LeilaGhaffari@users.noreply.github.com> -Leila Ghaffari +Kenneth E. Jansen +Layla Ghaffari +Layla Ghaffari <49916147+LeilaGhaffari@users.noreply.github.com> +Layla Ghaffari +Natalie Beams Natalie Beams <246972+nbeams@users.noreply.github.com> Rey Koki <36133157+reykoki@users.noreply.github.com> Rezgar Shakeri <42816410+rezgarshakeri@users.noreply.github.com> +Rezgar Shakeri +Riccardo Balin +Riccardo Balin +Thilina Ratnayaka Thilina Ratnayaka Tzanio Kolev +Umesh Unnikrishnan Valeria Barra Valeria Barra <39932030+valeriabarra@users.noreply.github.com> Valeria Barra @@ -31,3 +43,6 @@ Valeria Barra <11493037+pazner@users.noreply.github.com> Yohann Dudouit Yohann Dudouit +Zach Atkins +Zach Atkins +Zach Atkins diff --git a/.readthedocs.yml b/.readthedocs.yml index b530328523..2b173bda47 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,14 +5,15 @@ version: 2 build: - os: ubuntu-22.04 + os: ubuntu-24.04 tools: - python: "3.11" - nodejs: "19" + python: "3.13" + nodejs: "23" apt_packages: - librsvg2-bin jobs: post_create_environment: + - npx playwright install - npm install -g @mermaid-js/mermaid-cli # Build documentation in the docs/ directory with Sphinx diff --git a/AUTHORS b/AUTHORS index 8c6e400008..adc091e7a1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,11 +1,12 @@ Ahmad Abdelfattah +Zachary R. Atkins Valeria Barra Natalie Beams Jed Brown Jean-Sylvain Camier Veselin Dobrev Yohann Dudouit -Leila Ghaffari +Layla Ghaffari Sebastian Grimberg Tzanio Kolev David Medina diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 01e35d0de3..2f48726d3e 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -37,7 +37,7 @@ Examples of representing our community include using an official e-mail address, ## Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at jed@jedbrown.org, valeria@caltech.edu, or tzanio@llnl.gov. 
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at jed@jedbrown.org, vbarra@sdsu.edu, or tzanio@llnl.gov. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d3eeeb5c03..5f0cdbf4fd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -44,6 +44,18 @@ By making a contribution to this project, I certify that: (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. +## LLM Generated Content + +libCEED is a research software project, and we require citation of the origin of ideas in the same way that citations are expected for research papers. +See the [San Francisco Declaration on Research Assessment](https://sfdora.org/read) for discussion on treating other research outputs, such as datasets and software, as first-class artifacts like research papers. + +LLM/GenAI generated code can contain novel algorithms developed by other researchers and replicated without attribution. +As such, we cannot accept pull requests containing code predominantly generated by LLM/GenAI. + +LLMs may be used to aid the development of code for pull requests (PR); however, the individual submitting the PR must certify to its contents as described by the Developer's Certificate of Origin. +The human creating the PR is ultimately responsible for the content in the PR. +PRs must disclose and describe all LLM usage. + ## Authorship libCEED contains components authored by many individuals. 
diff --git a/Cargo.toml b/Cargo.toml index a987ca8a95..83aaac7b46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,13 @@ [workspace] members = [ - "rust/libceed", - "rust/libceed-sys", - "examples/rust/ex1-volume", - "examples/rust/ex2-surface", - "examples/rust/ex3-vector-volume", - "examples/rust/ex4-vector-surface", - "examples/rust/mesh", + "rust/libceed", + "rust/libceed-sys", + "examples/rust/ex1-volume", + "examples/rust/ex1-volume-vector", + "examples/rust/ex2-surface", + "examples/rust/ex2-surface-vector", + "examples/rust/ex3-volume", + "examples/rust/ex3-volume-vector", + "examples/rust/mesh", ] +exclude = ["examples/rust-qfunctions/ex1-volume-rs"] diff --git a/LICENSE b/LICENSE index ec06a37c93..85888e282b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/Makefile b/Makefile index 69f05b0673..f824ba2cd8 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,64 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause # # This file is part of CEED: http://github.com/ceed +# ------------------------------------------------------------ +# Configuration +# ------------------------------------------------------------ + +# config.mk stores cached configuration variables CONFIG ?= config.mk -include $(CONFIG) + +# common.mk holds definitions used in various makefiles throughout the project COMMON ?= common.mk -include $(COMMON) +# Quiet, color output +quiet ?= $($(1)) + +# Cancel built-in and old-fashioned implicit rules which we don't use +.SUFFIXES: + +.SECONDEXPANSION: # to expand $$(@D)/.DIR + +%/.DIR : + @mkdir -p $(@D) + @touch $@ + +.PRECIOUS: %/.DIR + + +DARWIN := $(filter Darwin,$(shell uname -s)) + + +# ------------------------------------------------------------ +# Root directories for backend dependencies +# ------------------------------------------------------------ + +# XSMM_DIR env variable should point to XSMM main (github.com/hfp/libxsmm) +XSMM_DIR ?= ../libxsmm + +# Often /opt/cuda or /usr/local/cuda, but sometimes present on machines that don't support CUDA +CUDA_DIR ?= +CUDA_ARCH ?= + +# Often /opt/rocm, but sometimes present on machines that don't support HIP +ROCM_DIR ?= +HIP_ARCH ?= + +# env variable MAGMA_DIR can be used too +MAGMA_DIR ?= ../magma + + +# ------------------------------------------------------------ +# Compiler flags +# ------------------------------------------------------------ + +# Detect user compiler options and set defaults ifeq (,$(filter-out undefined default,$(origin CC))) CC = gcc endif @@ -26,8 +75,11 @@ ifeq (,$(filter-out undefined default,$(origin AR))) AR = ar endif ifeq (,$(filter-out undefined default,$(origin ARFLAGS))) - ARFLAGS = crD + ARFLAGS = $(if $(DARWIN),cr,crD) endif +# Often /opt/rocm, but sometimes present on machines that don't support HIP +ROCM_DIR ?= ${HIP_DIR} +HIP_ARCH ?= NVCC ?= $(CUDA_DIR)/bin/nvcc NVCC_CXX ?= $(CXX) HIPCC ?= $(ROCM_DIR)/bin/hipcc @@ -39,6 +91,17 @@ 
ifneq ($(EMSCRIPTEN),) EM_LDFLAGS = -s TOTAL_MEMORY=256MB endif +HIP_CONFIG_RES := $(shell $(ROCM_DIR)/bin/hipconfig) +ifneq (,$(findstring __HIP_PLATFORM_SPIRV__,$(HIP_CONFIG_RES))) + HIP_LIB_NAME = CHIP +else ifneq (,$(findstring __HIP_PLATFORM_HCC__,$(HIP_CONFIG_RES))) + HIP_LIB_NAME = amdhip64 +else ifneq (,$(findstring __HIP_PLATFORM_AMD__,$(HIP_CONFIG_RES))) + HIP_LIB_NAME = amdhip64 +else + $(error "HIP platform not supported") +endif + # ASAN must be left empty if you don't want to use it ASAN ?= @@ -47,69 +110,28 @@ ASAN ?= # if any. If the user sets CEED_LDFLAGS or CEED_LDLIBS, they are used *instead # of* what we populate here (thus that's advanced usage and not recommended). CEED_LDFLAGS ?= -CEED_LDLIBS ?= +CEED_LDLIBS ?= UNDERSCORE ?= 1 # Verbose mode, V or VERBOSE V ?= $(VERBOSE) -# MFEM_DIR env variable should point to sibling directory -ifneq ($(wildcard ../mfem/libmfem.*),) - MFEM_DIR ?= ../mfem -endif - -# NEK5K_DIR env variable should point to sibling directory -ifneq ($(wildcard ../Nek5000/*),) - NEK5K_DIR ?= $(abspath ../Nek5000) -endif -export NEK5K_DIR -MPI ?= 1 - -# DEAL_II_DIR env variable should point to sibling directory -ifneq ($(wildcard ../dealii/install/lib/libdeal_II.*),) - DEAL_II_DIR ?= ../dealii/install -endif -export DEAL_II_DIR - -# CEED_DIR env for NEK5K testing -export CEED_DIR = $(abspath .) 
- -# XSMM_DIR env variable should point to XSMM main (github.com/hfp/libxsmm) -XSMM_DIR ?= ../libxsmm - -# OCCA_DIR env variable should point to OCCA main (github.com/libocca/occa) -OCCA_DIR ?= ../occa/install - -# env variable MAGMA_DIR can be used too -MAGMA_DIR ?= ../magma - -# Often /opt/cuda or /usr/local/cuda, but sometimes present on machines that don't support CUDA -CUDA_DIR ?= -CUDA_ARCH ?= - -# Often /opt/rocm, but sometimes present on machines that don't support HIP -ROCM_DIR ?= -HIP_ARCH ?= - -# Check for PETSc in ../petsc -ifneq ($(wildcard ../petsc/lib/libpetsc.*),) - PETSC_DIR ?= ../petsc -endif - -# SmartSim testing -SMARTREDIS_DIR ?= - -# Warning: SANTIZ options still don't run with /gpu/occa +# Sanitizer options AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer # Note: Intel oneAPI C/C++ compiler is now icx/icpx -CC_VENDOR := $(firstword $(filter gcc (GCC) clang icc icc_orig oneAPI XL emcc,$(subst -, ,$(shell $(CC) --version)))) +CC_VENDOR := $(firstword $(filter gcc (GCC) clang cc icc icc_orig oneAPI XL emcc,$(subst -, ,$(shell $(CC) --version)))) CC_VENDOR := $(subst (GCC),gcc,$(subst icc_orig,icc,$(CC_VENDOR))) +CC_VENDOR := $(if $(filter cc,$(CC_VENDOR)),gcc,$(CC_VENDOR)) FC_VENDOR := $(if $(FC),$(firstword $(filter GNU ifort ifx XL,$(shell $(FC) --version 2>&1 || $(FC) -qversion)))) +# Host architecture for setting appropriate flags +UNAME_M := $(shell uname -m) + # Default extra flags by vendor -MARCHFLAG.gcc := -march=native +# GCC: use -march=native only on x86 (where -mcpu is merely a deprecated synonym for -mtune); use -mcpu=native elsewhere +MARCHFLAG.gcc := $(if $(filter x86_64 i%86,$(UNAME_M)),-march=native,-mcpu=native) MARCHFLAG.clang := $(MARCHFLAG.gcc) MARCHFLAG.icc := MARCHFLAG.oneAPI := $(MARCHFLAG.clang) @@ -130,7 +152,7 @@ OPT.clang := $(OPT.gcc) OPT.icc := $(OPT.gcc) OPT.oneAPI := $(OPT.clang) OPT.emcc := -CFLAGS.gcc := $(if 
$(STATIC),,-fPIC) -std=c11 -Wall -Wextra -Wno-unused-parameter -MMD -MP CFLAGS.clang := $(CFLAGS.gcc) CFLAGS.icc := $(CFLAGS.gcc) CFLAGS.oneAPI := $(CFLAGS.clang) @@ -162,18 +184,19 @@ OMP_SIMD_FLAG := $(if $(call cc_check_flag,$(OMP_SIMD_FLAG)),$(OMP_SIMD_FLAG)) PEDANTIC ?= PEDANTICFLAGS ?= -Werror -pedantic +# Compiler flags OPT ?= -O $(MARCHFLAG) $(OPT.$(CC_VENDOR)) $(OMP_SIMD_FLAG) CFLAGS ?= $(OPT) $(CFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS)) CXXFLAGS ?= $(OPT) $(CXXFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS)) FFLAGS ?= $(OPT) $(FFLAGS.$(FC_VENDOR)) LIBCXX ?= -lstdc++ -NVCCFLAGS ?= -ccbin $(CXX) -Xcompiler "$(OPT)" -Xcompiler -fPIC +NVCCFLAGS ?= -ccbin $(CXX) -Xcompiler '$(OPT)' -Xcompiler -fPIC ifneq ($(CUDA_ARCH),) NVCCFLAGS += -arch=$(CUDA_ARCH) endif HIPCCFLAGS ?= $(filter-out $(OMP_SIMD_FLAG),$(OPT)) -fPIC -munsafe-fp-atomics ifneq ($(HIP_ARCH),) - HIPCCFLAGS += --amdgpu-target=$(HIP_ARCH) + HIPCCFLAGS += --offload-arch=$(HIP_ARCH) endif SYCL_FLAG := $(SYCL_FLAG.$(CC_VENDOR)) SYCLFLAGS ?= $(SYCL_FLAG) -fPIC -std=c++17 $(filter-out -std=c++11,$(CXXFLAGS)) $(filter-out $(OMP_SIMD_FLAG),$(OPT)) @@ -201,7 +224,6 @@ OBJDIR := build for_install := $(filter install,$(MAKECMDGOALS)) LIBDIR := $(if $(for_install),$(OBJDIR),lib) - # Installation variables prefix ?= /usr/local bindir = $(prefix)/bin @@ -221,7 +243,6 @@ MFLAGS := -j $(NPROCS) --warn-undefined-variables \ PYTHON ?= python3 PROVE ?= prove PROVE_OPTS ?= -j $(NPROCS) -DARWIN := $(filter Darwin,$(shell uname -s)) SO_EXT := $(if $(DARWIN),dylib,so) ceed.pc := $(LIBDIR)/pkgconfig/ceed.pc @@ -229,64 +250,80 @@ libceed.so := $(LIBDIR)/libceed.$(SO_EXT) libceed.a := $(LIBDIR)/libceed.a libceed := $(if $(STATIC),$(libceed.a),$(libceed.so)) CEED_LIBS = -lceed -libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/*.c gallery/*.c)) -gallery.c := $(wildcard 
gallery/*/ceed*.c) -libceed.c += $(gallery.c) libceeds = $(libceed) BACKENDS_BUILTIN := /cpu/self/ref/serial /cpu/self/ref/blocked /cpu/self/opt/serial /cpu/self/opt/blocked BACKENDS_MAKE := $(BACKENDS_BUILTIN) -TEST_BACKENDS := /cpu/self/tmpl /cpu/self/tmpl/sub -# Tests -tests.c := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c)) -tests.f := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90))) -tests := $(tests.c:tests/%.c=$(OBJDIR)/%$(EXE_SUFFIX)) -ctests := $(tests) -tests += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX)) -# Examples -examples.c := $(sort $(wildcard examples/ceed/*.c)) -examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f))) -examples := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX)) -examples += $(examples.f:examples/ceed/%.f=$(OBJDIR)/%$(EXE_SUFFIX)) -# MFEM Examples -mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp)) -mfemexamples := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%) -# Nek5K Examples -nekexamples := $(OBJDIR)/nek-bps -# PETSc Examples -petscexamples.c := $(wildcard examples/petsc/*.c) -petscexamples := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%) -# deal.II Examples -dealiiexamples := $(OBJDIR)/dealii-bps -# Fluid Dynamics Examples -fluidsexamples.c := $(sort $(wildcard examples/fluids/*.c)) -fluidsexamples.py := examples/fluids/smartsim_regression_framework.py -fluidsexamples := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%) -fluidsexamples += $(fluidsexamples.py:examples/fluids/%.py=$(OBJDIR)/fluids-py-%) -# Solid Mechanics Examples -solidsexamples.c := $(sort $(wildcard examples/solids/*.c)) -solidsexamples := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%) - -# Backends/[ref, blocked, memcheck, opt, avx, occa, magma] + +# ------------------------------------------------------------ +# Root directories for examples using external libraries +# ------------------------------------------------------------ + +# DEAL_II_DIR env variable should point to 
sibling directory +ifneq ($(wildcard ../dealii/install/lib/libdeal_II.*),) + DEAL_II_DIR ?= ../dealii/install +endif +# Export for deal.II testing +export DEAL_II_DIR + +# MFEM_DIR env variable should point to sibling directory +ifneq ($(wildcard ../mfem/libmfem.*),) + MFEM_DIR ?= ../mfem +endif + +# NEK5K_DIR env variable should point to sibling directory +ifneq ($(wildcard ../Nek5000/*),) + NEK5K_DIR ?= $(abspath ../Nek5000) +endif +# Exports for NEK5K testing +export CEED_DIR = $(abspath .) +export NEK5K_DIR +MPI ?= 1 + +# Check for PETSc in ../petsc +ifneq ($(wildcard ../petsc/lib/libpetsc.*),) + PETSC_DIR ?= ../petsc +endif + +# ------------------------------------------------------------ +# Build the library (default target) +# ------------------------------------------------------------ + +lib: $(libceed) $(ceed.pc) +# run 'lib' target in parallel +par:;@$(MAKE) $(MFLAGS) V=$(V) lib + +$(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so))) + +# ------------------------------------------------------------ +# Source files +# ------------------------------------------------------------ + +# Interface and gallery +libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/weak/*.c gallery/*.c)) +gallery.c := $(wildcard gallery/*/ceed*.c) +libceed.c += $(gallery.c) + +# Backends +# - CPU ref.c := $(sort $(wildcard backends/ref/*.c)) blocked.c := $(sort $(wildcard backends/blocked/*.c)) ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c)) opt.c := $(sort $(wildcard backends/opt/*.c)) avx.c := $(sort $(wildcard backends/avx/*.c)) xsmm.c := $(sort $(wildcard backends/xsmm/*.c)) +# - GPU cuda.c := $(sort $(wildcard backends/cuda/*.c)) cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp)) cuda-ref.c := $(sort $(wildcard backends/cuda-ref/*.c)) cuda-ref.cpp := $(sort $(wildcard backends/cuda-ref/*.cpp)) 
cuda-ref.cu := $(sort $(wildcard backends/cuda-ref/kernels/*.cu)) cuda-shared.c := $(sort $(wildcard backends/cuda-shared/*.c)) -cuda-shared.cu := $(sort $(wildcard backends/cuda-shared/kernels/*.cu)) cuda-gen.c := $(sort $(wildcard backends/cuda-gen/*.c)) cuda-gen.cpp := $(sort $(wildcard backends/cuda-gen/*.cpp)) -cuda-gen.cu := $(sort $(wildcard backends/cuda-gen/kernels/*.cu)) -occa.cpp := $(sort $(shell find backends/occa -type f -name *.cpp)) -magma.c := $(sort $(wildcard backends/magma/*.c)) -magma.cpp := $(sort $(wildcard backends/magma/*.cpp)) +cuda-all.c := interface/ceed-cuda.c $(cuda.c) $(cuda-ref.c) $(cuda-shared.c) $(cuda-gen.c) +cuda-all.cpp := $(cuda.cpp) $(cuda-ref.cpp) $(cuda-gen.cpp) +cuda-all.cu := $(cuda-ref.cu) hip.c := $(sort $(wildcard backends/hip/*.c)) hip.cpp := $(sort $(wildcard backends/hip/*.cpp)) hip-ref.c := $(sort $(wildcard backends/hip-ref/*.c)) @@ -295,34 +332,104 @@ hip-ref.hip := $(sort $(wildcard backends/hip-ref/kernels/*.hip.cpp)) hip-shared.c := $(sort $(wildcard backends/hip-shared/*.c)) hip-gen.c := $(sort $(wildcard backends/hip-gen/*.c)) hip-gen.cpp := $(sort $(wildcard backends/hip-gen/*.cpp)) +hip-all.c := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c) +hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp) +hip-all.hip := $(hip-ref.hip) sycl-core.cpp := $(sort $(wildcard backends/sycl/*.sycl.cpp)) sycl-ref.cpp := $(sort $(wildcard backends/sycl-ref/*.sycl.cpp)) sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp)) sycl-gen.cpp := $(sort $(wildcard backends/sycl-gen/*.sycl.cpp)) +magma.c := $(sort $(wildcard backends/magma/*.c)) +magma.cpp := $(sort $(wildcard backends/magma/*.cpp)) -hip-all.c := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c) -hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp) +# Tests +tests.c := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c)) +tests.f := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90))) +tests := 
$(tests.c:tests/%.c=$(OBJDIR)/%$(EXE_SUFFIX)) +ctests := $(tests) +tests += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX)) -# Quiet, color output -quiet ?= $($(1)) +# Examples +examples.c := $(sort $(wildcard examples/ceed/*.c)) +examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f90))) +examples := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX)) +examples += $(examples.f:examples/ceed/%.f90=$(OBJDIR)/%$(EXE_SUFFIX)) -# Cancel built-in and old-fashioned implicit rules which we don't use -.SUFFIXES: +# deal.II Examples +dealiiexamples.cc := $(sort $(wildcard examples/deal.II/*.cc)) +dealiiexamples := $(dealiiexamples.cc:examples/deal.II/%.cc=$(OBJDIR)/dealii-%) -.SECONDEXPANSION: # to expand $$(@D)/.DIR +# MFEM Examples +mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp)) +mfemexamples := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%) -%/.DIR : - @mkdir -p $(@D) - @touch $@ +# Nek5K Examples +nekexamples := $(OBJDIR)/nek-bps -.PRECIOUS: %/.DIR +# Rust QFunction Examples +rustqfunctions.c := $(sort $(wildcard examples/rust-qfunctions/*.c)) +rustqfunctionsexamples := $(rustqfunctions.c:examples/rust-qfunctions/%.c=$(OBJDIR)/rustqfunctions-%) + +# PETSc Examples +petscexamples.c := $(wildcard examples/petsc/*.c) +petscexamples := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%) + +# Fluid Dynamics Example +fluidsexamples.c := $(sort $(wildcard examples/fluids/*.c)) +fluidsexamples := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%) + +# Solid Mechanics Example +solidsexamples.c := $(sort $(wildcard examples/solids/*.c)) +solidsexamples := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%) + + +# ------------------------------------------------------------ +# View configuration options +# ------------------------------------------------------------ -lib: $(libceed) $(ceed.pc) -# run 'lib' target in parallel -par:;@$(MAKE) $(MFLAGS) V=$(V) lib backend_status = $(if $(filter $1,$(BACKENDS_MAKE)), [backends: $1], 
[not found]) + +info-basic: + $(info -----------------------------------------) + $(info | ___ __ ______________________ |) + $(info | / (_) /_ / ____/ ____/ ____/ __ \ |) + $(info | / / / __ \/ / / __/ / __/ / / / / |) + $(info | / / / /_/ / /___/ /___/ /___/ /_/ / |) + $(info | /_/_/_.___/\____/_____/_____/_____/ |) + $(info -----------------------------------------) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Built-in Backends:) + $(info $(BACKENDS_BUILTIN)) + $(info ) + $(info Additional Backends:) + $(info $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS))) + $(info ) + $(info -----------------------------------------) + $(info ) + @true + info: - $(info ------------------------------------) + $(info -----------------------------------------) + $(info | ___ __ ______________________ |) + $(info | / (_) /_ / ____/ ____/ ____/ __ \ |) + $(info | / / / __ \/ / / __/ / __/ / / / / |) + $(info | / / / /_/ / /___/ /___/ /___/ /_/ / |) + $(info | /_/_/_.___/\____/_____/_____/_____/ |) + $(info -----------------------------------------) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Built-in Backends:) + $(info $(BACKENDS_BUILTIN)) + $(info ) + $(info Additional Backends:) + $(info $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS))) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Compiler Flags:) $(info CC = $(CC)) $(info CXX = $(CXX)) $(info FC = $(FC)) @@ -341,35 +448,54 @@ info: $(info AFLAGS = $(AFLAGS)) $(info ASAN = $(or $(ASAN),(empty))) $(info VERBOSE = $(or $(V),(empty)) [verbose=$(if $(V),on,off)]) - $(info ------------------------------------) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Backend Dependencies:) $(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS))) $(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS))) $(info XSMM_DIR = $(XSMM_DIR)$(call 
backend_status,$(XSMM_BACKENDS))) - $(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS))) - $(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS))) $(info CUDA_DIR = $(CUDA_DIR)$(call backend_status,$(CUDA_BACKENDS))) $(info ROCM_DIR = $(ROCM_DIR)$(call backend_status,$(HIP_BACKENDS))) $(info SYCL_DIR = $(SYCL_DIR)$(call backend_status,$(SYCL_BACKENDS))) - $(info ------------------------------------) + $(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS))) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Example Dependencies:) $(info MFEM_DIR = $(MFEM_DIR)) $(info NEK5K_DIR = $(NEK5K_DIR)) $(info PETSC_DIR = $(PETSC_DIR)) $(info DEAL_II_DIR = $(DEAL_II_DIR)) - $(info ------------------------------------) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Install Options:) $(info prefix = $(prefix)) $(info includedir = $(value includedir)) $(info libdir = $(value libdir)) $(info pkgconfigdir = $(value pkgconfigdir)) - $(info ------------------------------------) + $(info ) + $(info -----------------------------------------) + $(info ) + $(info Git:) + $(info describe = $(GIT_DESCRIBE)) + $(info ) + $(info -----------------------------------------) @true + info-backends: $(info make: 'lib' with optional backends: $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS))) @true + info-backends-all: - $(info make: 'lib' with backends: $(filter-out $(TEST_BACKENDS),$(BACKENDS))) + $(info make: 'lib' with backends: $(BACKENDS)) @true -$(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so))) + +# ------------------------------------------------------------ +# Backends +# ------------------------------------------------------------ # Standard Backends libceed.c += $(ref.c) @@ -423,22 +549,6 @@ ifneq ($(wildcard $(XSMM_DIR)/lib/libxsmm.*),) BACKENDS_MAKE += $(XSMM_BACKENDS) endif -# OCCA Backends -OCCA_BACKENDS = /cpu/self/occa -ifneq 
($(wildcard $(OCCA_DIR)/lib/libocca.*),) - OCCA_MODES := $(shell LD_LIBRARY_PATH=$(OCCA_DIR)/lib $(OCCA_DIR)/bin/occa modes) - OCCA_BACKENDS += $(if $(filter OpenMP,$(OCCA_MODES)),/cpu/openmp/occa) - OCCA_BACKENDS += $(if $(filter dpcpp,$(OCCA_MODES)),/gpu/dpcpp/occa) - OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa) - OCCA_BACKENDS += $(if $(filter HIP,$(OCCA_MODES)),/gpu/hip/occa) - OCCA_BACKENDS += $(if $(filter CUDA,$(OCCA_MODES)),/gpu/cuda/occa) - $(libceeds) : CPPFLAGS += -I$(OCCA_DIR)/include - PKG_LIBS += -L$(abspath $(OCCA_DIR))/lib -locca - LIBCEED_CONTAINS_CXX = 1 - libceed.cpp += $(occa.cpp) - BACKENDS_MAKE += $(OCCA_BACKENDS) -endif - # CUDA Backends ifneq ($(CUDA_DIR),) CUDA_LIB_DIR := $(wildcard $(foreach d,lib lib64 lib/x86_64-linux-gnu,$(CUDA_DIR)/$d/libcudart.${SO_EXT})) @@ -452,27 +562,34 @@ ifneq ($(CUDA_LIB_DIR),) PKG_STUBS_LIBS += -L$(CUDA_LIB_DIR_STUBS) LIBCEED_CONTAINS_CXX = 1 libceed.c += interface/ceed-cuda.c - libceed.c += $(cuda.c) $(cuda-ref.c) $(cuda-shared.c) $(cuda-gen.c) - libceed.cpp += $(cuda.cpp) $(cuda-ref.cpp) $(cuda-gen.cpp) - libceed.cu += $(cuda-ref.cu) $(cuda-shared.cu) $(cuda-gen.cu) + libceed.c += $(cuda-all.c) + libceed.cpp += $(cuda-all.cpp) + libceed.cu += $(cuda-all.cu) BACKENDS_MAKE += $(CUDA_BACKENDS) endif # HIP Backends -HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/libamdhip64.${SO_EXT})) +HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME}.${SO_EXT})) HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR)))) HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen ifneq ($(HIP_LIB_DIR),) - HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C)) - $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS) + HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C) + # chipStar hipconfig -C includes clang-only flags (--target=, --offload=, -nohipwrapperinc, --hip-path=); + # strip those out 
for gcc-compiled C sources, keeping -D/-I/-include flags + ifeq ($(HIP_LIB_NAME),CHIP) + HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include + else + HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS) + endif + $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C) ifneq ($(CXX), $(HIPCC)) - $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS) + $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C) endif - PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -lamdhip64 -lhipblas + PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas LIBCEED_CONTAINS_CXX = 1 libceed.c += $(hip-all.c) libceed.cpp += $(hip-all.cpp) - libceed.hip += $(hip-ref.hip) + libceed.hip += $(hip-all.hip) BACKENDS_MAKE += $(HIP_BACKENDS) endif @@ -485,7 +602,7 @@ endif ifneq ($(SYCL_LIB_DIR),) PKG_LIBS += $(SYCL_FLAG) -lze_loader LIBCEED_CONTAINS_CXX = 1 - libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp) + libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp) BACKENDS_MAKE += $(SYCL_BACKENDS) endif @@ -528,6 +645,11 @@ endif BACKENDS ?= $(BACKENDS_MAKE) export BACKENDS + +# ------------------------------------------------------------ +# Linker Flags +# ------------------------------------------------------------ + _pkg_ldflags = $(filter -L%,$(PKG_LIBS)) _pkg_ldlibs = $(filter-out -L%,$(PKG_LIBS)) $(libceeds) : CEED_LDFLAGS += $(_pkg_ldflags) $(if $(STATIC),,$(_pkg_ldflags:-L%=-Wl,-rpath,%)) $(PKG_STUBS_LIBS) @@ -539,13 +661,23 @@ endif pkgconfig-libs-private = $(PKG_LIBS) ifeq ($(LIBCEED_CONTAINS_CXX),1) - $(libceeds) : LINK = $(CXX) + ifneq ($(SYCL_LIB_DIR),) + $(libceeds) : LINK = $(SYCLCXX) + $(libceeds) : CEED_LDFLAGS += $(SYCLFLAGS) + else + $(libceeds) : LINK = $(CXX) + endif ifeq ($(STATIC),1) $(examples) $(tests) : CEED_LDLIBS += 
$(LIBCXX) pkgconfig-libs-private += $(LIBCXX) endif endif + +# ------------------------------------------------------------ +# Building core library components +# ------------------------------------------------------------ + # File names *-weak.c contain weak symbol definitions, which must be listed last # when creating shared or static libraries. weak_last = $(filter-out %-weak.o,$(1)) $(filter %-weak.o,$(1)) @@ -560,7 +692,7 @@ $(libceed.a) : $(call weak_last,$(libceed.o)) | $$(@D)/.DIR $(call quiet,AR) $(ARFLAGS) $@ $^ $(OBJDIR)/%.o : $(CURDIR)/%.c | $$(@D)/.DIR - $(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<) + $(call quiet,CC) $(CPPFLAGS) $(CFLAGS) $(CONFIGFLAGS) -c -o $@ $(abspath $<) $(OBJDIR)/%.o : $(CURDIR)/%.cpp | $$(@D)/.DIR $(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<) @@ -578,7 +710,7 @@ $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR $(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<) $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR - $(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS) + $(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS) -I./tests/test-include $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR $(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $( $@ +GIT_DESCRIBE = $(shell git -c safe.directory=$(CURDIR) describe --always --dirty 2>/dev/null || printf "unknown\n") + +$(OBJDIR)/interface/ceed-config.o: Makefile +$(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" +$(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\"" + $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath $(includedir))/\"" + 
+# ------------------------------------------------------------ +# Installation +# ------------------------------------------------------------ + install : $(libceed) $(OBJDIR)/ceed.pc $(INSTALL) -d $(addprefix $(if $(DESTDIR),"$(DESTDIR)"),"$(includedir)"\ "$(includedir)/ceed/" "$(includedir)/ceed/jit-source/"\ @@ -732,6 +903,7 @@ install : $(libceed) $(OBJDIR)/ceed.pc "$(includedir)/ceed/jit-source/gallery/" "$(includedir)/ceed/jit-source/magma/"\ "$(includedir)/ceed/jit-source/sycl/" "$(libdir)" "$(pkgconfigdir)") $(INSTALL_DATA) include/ceed/ceed.h "$(DESTDIR)$(includedir)/ceed/" + $(INSTALL_DATA) include/ceed/deprecated.h "$(DESTDIR)$(includedir)/ceed/" $(INSTALL_DATA) include/ceed/types.h "$(DESTDIR)$(includedir)/ceed/" $(INSTALL_DATA) include/ceed/ceed-f32.h "$(DESTDIR)$(includedir)/ceed/" $(INSTALL_DATA) include/ceed/ceed-f64.h "$(DESTDIR)$(includedir)/ceed/" @@ -749,18 +921,26 @@ install : $(libceed) $(OBJDIR)/ceed.pc $(INSTALL_DATA) $(wildcard include/ceed/jit-source/magma/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/magma/" $(INSTALL_DATA) $(wildcard include/ceed/jit-source/sycl/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/sycl/" -.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all + +# ------------------------------------------------------------ +# Cleaning +# ------------------------------------------------------------ cln clean : $(RM) -r $(OBJDIR) $(LIBDIR) dist *egg* .pytest_cache *cffi* $(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))" $(call quiet,MAKE) -C python/tests clean $(RM) benchmarks/*output.txt + $(RM) -rf temp distclean : clean $(RM) -r doc/html doc/sphinx/build $(CONFIG) + +# ------------------------------------------------------------ # Documentation +# ------------------------------------------------------------ + DOXYGEN ?= doxygen doxygen : @@ -771,13 +951,18 @@ doc-html doc-latexpdf doc-epub doc-livehtml : 
doc-% : doxygen doc : doc-html + +# ------------------------------------------------------------ +# Linting utilities +# ------------------------------------------------------------ + # Style/Format CLANG_FORMAT ?= clang-format CLANG_FORMAT_OPTS += -style=file -i AUTOPEP8 ?= autopep8 AUTOPEP8_OPTS += --in-place --aggressive --max-line-length 120 -format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]')) +format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h) $(wildcard examples/ceed/ex*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]' '*.cu')) format.py := $(filter-out tests/junit-xml/junit_xml/__init__.py, $(shell git ls-files '*.py')) format.ot := $(filter-out doc/sphinx/source/CODE_OF_CONDUCT.md doc/sphinx/source/CONTRIBUTING.md, $(shell git ls-files '*.md' '*.f90')) @@ -794,7 +979,7 @@ format : format-c format-py format-ot # Vermin - python version requirements VERMIN ?= vermin -VERMIN_OPTS += -t=3.7- --violations +VERMIN_OPTS += -t=3.8- --violations vermin : $(VERMIN) $(VERMIN_OPTS) $(format.py) @@ -803,10 +988,10 @@ vermin : CLANG_TIDY ?= clang-tidy %.c.tidy : %.c - $(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c99 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" + $(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\"" %.cpp.tidy : %.cpp - $(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(OCCA_DIR)/include -I$(ROCM_DIR)/include + $(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include tidy-c : $(libceed.c:%=%.tidy) tidy-cpp : $(libceed.cpp:%=%.tidy) @@ -821,6 +1006,11 @@ endif iwyu : $(MAKE) -B CC=$(IWYU_CC) + +# 
------------------------------------------------------------ +# Variable printing for debugging +# ------------------------------------------------------------ + print : @echo $(VAR)=$($(VAR)) @@ -833,6 +1023,11 @@ print-% : $(info ) @true + +# ------------------------------------------------------------ +# Configuration caching +# ------------------------------------------------------------ + # "make configure" detects any variables passed on the command line or # previously set in config.mk, caching them in config.mk as simple # (:=) variables. Variables set in config.mk or on the command line @@ -852,7 +1047,7 @@ print-% : CONFIG_VARS = CC CXX FC NVCC NVCC_CXX HIPCC \ OPT CFLAGS CPPFLAGS CXXFLAGS FFLAGS NVCCFLAGS HIPCCFLAGS SYCLFLAGS \ AR ARFLAGS LDFLAGS LDLIBS LIBCXX SED \ - MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR SMARTREDIS_DIR + MAGMA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR # $(call needs_save,CFLAGS) returns true (a nonempty string) if CFLAGS # was set on the command line or in config.mk (where it will appear as @@ -865,6 +1060,11 @@ configure : @echo "Configuration cached in $(CONFIG):" @cat $(CONFIG) + +# ------------------------------------------------------------ +# Building Python wheels for deployment +# ------------------------------------------------------------ + wheel : export MARCHFLAG = -march=generic wheel : export WHEEL_PLAT = manylinux2010_x86_64 wheel : @@ -872,7 +1072,13 @@ wheel : -e MARCHFLAG -e WHEEL_PLAT \ quay.io/pypa/$(WHEEL_PLAT) python/make-wheels.sh -.PHONY : configure wheel +# ------------------------------------------------------------ +# Phony targets +# ------------------------------------------------------------ + +# These targets are not files but rather commands to run +.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends 
info-backends-all configure wheel + # Include *.d deps when not -B = --always-make: useful if the paths are wonky in a container -include $(if $(filter B,$(MAKEFLAGS)),,$(libceed.c:%.c=$(OBJDIR)/%.d) $(tests.c:tests/%.c=$(OBJDIR)/%.d)) diff --git a/README.md b/README.md index 7c2d32c0ea..9725a1422c 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ The CEED research is supported by the [Exascale Computing Project](https://exasc For more details on the CEED API see the [user manual](https://libceed.org/en/latest/). -% gettingstarted-inclusion-marker + ## Building @@ -183,13 +183,6 @@ There are multiple supported backends, which can be selected at runtime in the e | `/gpu/hip/magma` | HIP MAGMA kernels | No | | `/gpu/hip/magma/det` | HIP MAGMA kernels | Yes | || -| **OCCA** | -| `/*/occa` | Selects backend based on available OCCA modes | Yes | -| `/cpu/self/occa` | OCCA backend with serial CPU kernels | Yes | -| `/cpu/openmp/occa` | OCCA backend with OpenMP kernels | Yes | -| `/cpu/dpcpp/occa` | OCCA backend with DPC++ kernels | Yes | -| `/gpu/cuda/occa` | OCCA backend with CUDA kernels | Yes | -| `/gpu/hip/occa` | OCCA backend with HIP kernels | Yes | The `/cpu/self/*/serial` backends process one element at a time and are intended for meshes with a smaller number of high order elements. The `/cpu/self/*/blocked` backends process blocked batches of eight interlaced elements and are intended for meshes with higher numbers of elements. @@ -207,6 +200,7 @@ This backend can be run in serial or blocked mode and defaults to running in the The `/cpu/self/xsmm/*` backends rely upon the [LIBXSMM](https://github.com/libxsmm/libxsmm) package to provide vectorized CPU performance. If linking MKL and LIBXSMM is desired but the Makefile is not detecting `MKLROOT`, linking libCEED against MKL can be forced by setting the environment variable `MKL=1`. +The LIBXSMM `main` development branch from 7 April 2024 or newer is required. 
The `/gpu/cuda/*` backends provide GPU performance strictly using CUDA. @@ -229,31 +223,24 @@ For example: > - `/gpu/cuda/gen:device_id=1` -The `/*/occa` backends rely upon the [OCCA](http://github.com/libocca/occa) package to provide cross platform performance. -To enable the OCCA backend, the environment variable `OCCA_DIR` must point to the top-level OCCA directory, with the OCCA library located in the `${OCCA_DIR}/lib` (By default, `OCCA_DIR` is set to `../occa`). -OCCA version 1.4.0 or newer is required. - -Users can pass specific OCCA device properties after setting the CEED resource. -For example: - -> - `"/*/occa:mode='CUDA',device_id=0"` - Bit-for-bit reproducibility is important in some applications. However, some libCEED backends use non-deterministic operations, such as `atomicAdd` for increased performance. The backends which are capable of generating reproducible results, with the proper compilation options, are highlighted in the list above. + + ## Examples libCEED comes with several examples of its usage, ranging from standalone C codes in the `/examples/ceed` directory to examples based on external packages, such as MFEM, PETSc, and Nek5000. Nek5000 v18.0 or greater is required. -To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and run: +To build the examples, set the `MFEM_DIR`, `PETSC_DIR` (and optionally `PETSC_ARCH`), and `NEK5K_DIR` variables and run: ```console $ cd examples/ ``` -% running-examples-inclusion-marker + ```console # libCEED examples on CPU and GPU @@ -336,7 +323,7 @@ The above code assumes a GPU-capable machine with the CUDA backends enabled. Depending on the available backends, other CEED resource specifiers can be provided with the `-ceed` option. Other command line arguments can be found in [examples/petsc](https://github.com/CEED/libCEED/blob/main/examples/petsc/README.md). 
-% benchmarks-marker + ## Benchmarks @@ -414,7 +401,22 @@ If you utilize libCEED please cite: ```bibtex @article{libceed-joss-paper, - author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stan Tomov}, + author = { + Brown, Jed and + Abdelfattah, Ahmad and + Barra, Valeria and + Beams, Natalie and + Camier, Jean-Sylvain and + Dobrev, Veselin and + Dudouit, Yohann and + Ghaffari, Leila and + Kolev, Tzanio and + Medina, David and + Pazner, Will and + Ratnayaka, Thilina and + Thompson, Jeremy L. and + Tomov, Stan + }, title = {{libCEED}: Fast algebra for high-order element-based discretizations}, journal = {Journal of Open Source Software}, year = {2021}, @@ -431,23 +433,25 @@ To cite the user manual: ```bibtex @misc{libceed-user-manual, - author = {Abdelfattah, Ahmad and - Barra, Valeria and - Beams, Natalie and - Brown, Jed and - Camier, Jean-Sylvain and - Dobrev, Veselin and - Dudouit, Yohann and - Ghaffari, Leila and - Grimberg, Sebastian and - Kolev, Tzanio and - Medina, David and - Pazner, Will and - Ratnayaka, Thilina and - Shakeri, Rezgar and - Thompson, Jeremy L and - Tomov, Stanimire and - Wright III, James}, + author = { + Abdelfattah, Ahmad and + Barra, Valeria and + Beams, Natalie and + Brown, Jed and + Camier, Jean-Sylvain and + Dobrev, Veselin and + Dudouit, Yohann and + Ghaffari, Leila and + Grimberg, Sebastian and + Kolev, Tzanio and + Medina, David and + Pazner, Will and + Ratnayaka, Thilina and + Shakeri, Rezgar and + Thompson, Jeremy L. 
and + Tomov, Stanimire and + Wright III, James + }, title = {{libCEED} User Manual}, month = nov, year = 2023, @@ -460,9 +464,14 @@ To cite the user manual: For libCEED's Python interface please cite: ```bibtex -@InProceedings{libceed-paper-proc-scipy-2020, - author = {{V}aleria {B}arra and {J}ed {B}rown and {J}eremy {T}hompson and {Y}ohann {D}udouit}, - title = {{H}igh-performance operator evaluations with ease of use: lib{C}{E}{E}{D}'s {P}ython interface}, +@InProceedings{libceed-scipy, + author = { + Barra, Valeria and + Brown, Jed and + Thompson, Jeremy L. and + Dudouit, Yohann + }, + title = {{H}igh-performance operator evaluations with ease of use: {libCEED}'s {P}ython interface}, booktitle = {{P}roceedings of the 19th {P}ython in {S}cience {C}onference}, pages = {85 - 90}, year = {2020}, @@ -477,7 +486,7 @@ The BibTeX entries for these references can be found in the `doc/bib/references. The following copyright applies to each file in the CEED software suite, unless otherwise stated in the file: -> Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +> Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. > All rights reserved. See files LICENSE and NOTICE for details. diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c index c565faa653..8452fd8591 100644 --- a/backends/avx/ceed-avx-blocked.c +++ b/backends/avx/ceed-avx-blocked.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -25,6 +25,7 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx)); return CEED_ERROR_SUCCESS; diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c index 5ebe28e19a..06ed5f9fdb 100644 --- a/backends/avx/ceed-avx-serial.c +++ b/backends/avx/ceed-avx-serial.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -25,6 +25,7 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx)); return CEED_ERROR_SUCCESS; diff --git a/backends/avx/ceed-avx-tensor.c b/backends/avx/ceed-avx-tensor.c index cd22249e83..40d5df0646 100644 --- a/backends/avx/ceed-avx-tensor.c +++ b/backends/avx/ceed-avx-tensor.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -10,7 +10,7 @@ #include #include -#ifdef CEED_F64_H +#ifdef CEED_SCALAR_IS_FP64 #define rtype __m256d #define loadu _mm256_loadu_pd #define storeu _mm256_storeu_pd diff --git a/backends/avx/ceed-avx.h b/backends/avx/ceed-avx.h index 786be45a0d..cb151baa85 100644 --- a/backends/avx/ceed-avx.h +++ b/backends/avx/ceed-avx.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c index d1f2678e1d..0161c2819f 100644 --- a/backends/blocked/ceed-blocked-operator.c +++ b/backends/blocked/ceed-blocked-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,9 +16,9 @@ //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size, - CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, - CeedInt start_e, CeedInt num_fields, CeedInt Q) { +static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedInt *e_data_out_indices, + bool *apply_add_basis, const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, + CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { Ceed ceed; CeedSize e_size, q_size; CeedInt num_comp, size, P; @@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - if (ceed_parent) ceed = ceed_parent; + CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); } if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); @@ -101,10 +102,14 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides, &block_rstr[i + start_e])); } break; + // LCOV_EXCL_START case CEED_RESTRICTION_POINTS: // Empty case - won't occur break; + // LCOV_EXCL_STOP } + CeedCallBackend(CeedDestroy(&ceed_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e])); } @@ -122,6 +127,7 @@ static int 
CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisDestroy(&basis)); e_size = (CeedSize)P * num_comp * block_size; CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); q_size = (CeedSize)Q * size * block_size; @@ -132,9 +138,63 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo q_size = (CeedSize)Q * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; } } + // Drop duplicate restrictions + if (is_input) { + for (CeedInt i = 0; i < num_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e])); + skip_rstr[j] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } else { + for (CeedInt i = num_fields - 1; i >= 0; i--) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i - 1; j >= 0; j--) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e])); + skip_rstr[j] = true; + apply_add_basis[i] = true; + e_data_out_indices[j] = i; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -143,7 +203,6 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo //------------------------------------------------------------------------------ static int CeedOperatorSetup_Blocked(CeedOperator op) { bool is_setup_done; - Ceed ceed; CeedInt Q, num_input_fields, num_output_fields; const CeedInt block_size = 8; CeedQFunctionField *qf_input_fields, *qf_output_fields; @@ -154,7 +213,6 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); @@ -166,6 +224,10 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr)); CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, 
&impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_data_out_indices)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); @@ -177,11 +239,12 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { // Set up infield and outfield pointer arrays // Infields - CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, - num_input_fields, Q)); + CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, impl->skip_rstr_in, NULL, NULL, block_size, impl->block_rstr, impl->e_vecs_full, + impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, - num_input_fields, num_output_fields, Q)); + CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->skip_rstr_out, impl->e_data_out_indices, impl->apply_add_basis_out, block_size, + impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, + num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { @@ -200,6 +263,7 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { } CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -210,13 +274,15 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active; uint64_t state; 
CeedEvalMode eval_mode; CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) { if (skip_active) continue; else vec = in_vec; } @@ -226,13 +292,14 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed } else { // Restrict CeedCallBackend(CeedVectorGetState(vec, &state)); - if (state != impl->input_states[i] || vec == in_vec) { + if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) { CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); - impl->input_states[i] = state; } + impl->input_states[i] = state; // Get evec CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i])); } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } return CEED_ERROR_SUCCESS; } @@ -251,15 +318,19 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc // Skip active input if (skip_active) { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (is_active) continue; } // Get elem_size, eval_mode, size CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action @@ -275,6 +346,7 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 
CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][(CeedSize)e * elem_size * num_comp])); CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; case CEED_EVAL_WEIGHT: break; // No action @@ -287,8 +359,8 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc // Output Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, - CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { + CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis, + CeedOperator op, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { for (CeedInt i = 0; i < num_output_fields; i++) { CeedInt elem_size, num_comp; CeedEvalMode eval_mode; @@ -298,6 +370,7 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun // Get elem_size, eval_mode, size CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { @@ -311,7 +384,12 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * elem_size * num_comp])); - 
CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + if (apply_add_basis[i]) { + CeedCallBackend(CeedBasisApplyAdd(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + } else { + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -333,10 +411,13 @@ static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, Ce // Skip active inputs if (skip_active) { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (is_active) continue; } CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip @@ -360,30 +441,34 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Blocked *impl; - CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); - // Setup CeedCallBackend(CeedOperatorSetup_Blocked(op)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + // Restriction only operator if (impl->is_identity_rstr_op) { CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[0], 
CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); return CEED_ERROR_SUCCESS; } + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request)); // Output Evecs - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields])); + for (CeedInt i = num_output_fields - 1; i >= 0; i--) { + if (impl->skip_rstr_out[i]) { + e_data_full[i + num_input_fields] = e_data_full[impl->e_data_out_indices[i] + num_input_fields]; + } else { + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields])); + } } // Loop through elements @@ -393,8 +478,8 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - CeedCallBackend( - CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * Q * size])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, + &e_data_full[i + 
num_input_fields][(CeedSize)e * Q * size])); } } @@ -407,27 +492,32 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed } // Output basis apply - CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op, - e_data_full, impl)); + CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, + impl->apply_add_basis_out, op, e_data_full, impl)); } // Output restriction for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active; CeedVector vec; + if (impl->skip_rstr_out[i]) continue; // Restore evec CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields])); // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; // Active - if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; + if (is_active) vec = out_vec; // Restrict - CeedCallBackend( - CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); + CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, + request)); + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -437,8 +527,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { Ceed ceed; - CeedSize q_size; - CeedInt Q, 
num_input_fields, num_output_fields, num_elem, size; + CeedInt qf_size_in, qf_size_out, Q, num_input_fields, num_output_fields, num_elem; const CeedInt block_size = 8; CeedScalar *l_vec_array; CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; @@ -448,8 +537,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o CeedOperator_Blocked *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; - CeedVector *active_in = impl->qf_active_in; + qf_size_in = impl->qf_size_in; + qf_size_out = impl->qf_size_out; CeedVector l_vec = impl->qf_l_vec; CeedElemRestriction block_rstr = impl->qf_block_rstr; @@ -471,55 +560,45 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request)); // Count number of active input fields - if (!num_active_in) { + if (qf_size_in == 0) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedScalar *q_vec_array; - CeedVector vec; + CeedInt field_size; + CeedVector vec; - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); - CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); - for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q * block_size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend( - 
CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size])); - } - num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); + qf_size_in += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_in = qf_size_in; } // Count number of active output fields - if (!num_active_out) { + if (qf_size_out == 0) { for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt field_size; CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - num_active_out += size; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_size_out += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_out = num_active_out; + CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_out = qf_size_out; } - // Check sizes - CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); - // Setup Lvec if (!l_vec) { - const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * num_active_in * num_active_out; + const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * qf_size_in * qf_size_out; CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec)); impl->qf_l_vec = l_vec; @@ -528,21 +607,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o // Setup block restriction if 
(!block_rstr) { - const CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + const CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; - CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, &block_rstr)); + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, qf_size_in * qf_size_out, + qf_size_in * qf_size_out * num_elem * Q, strides, &block_rstr)); impl->qf_block_rstr = block_rstr; } // Build objects if needed if (build_objects) { - const CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - const CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; + const CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; // Create output restriction - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, - strides, rstr)); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, + (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr)); // Create assembled vector CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); } @@ -553,37 +632,64 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, true, e_data_full, impl)); // Assemble QFunction - for (CeedInt in = 0; in < num_active_in; in++) { - // Set Inputs - CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); - if (num_active_in > 1) { - CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); - } - if (!impl->is_identity_qf) { - // Set Outputs - for (CeedInt out = 0; out < 
num_output_fields; out++) { - CeedVector vec; - - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); - // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); - l_vec_array += size * Q * block_size; // Advance the pointer by the size of the output + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active; + CeedInt field_size; + CeedVector vec; + + // Check if active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active) continue; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); + for (CeedInt field = 0; field < field_size; field++) { + // Set current portion of input to 1.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 1.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); + } + + if (!impl->is_identity_qf) { + // Set Outputs + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedInt field_size; + CeedVector vec; + + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); + // Check if active output + if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size)); + l_vec_array += field_size * Q * block_size; // Advance the pointer by the size of the output + } + CeedCallBackend(CeedVectorDestroy(&vec)); } + // Apply QFunction + CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, 
impl->q_vecs_out)); + } else { + CeedInt field_size; + const CeedScalar *array; + + // Copy Identity Outputs + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size)); + CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < field_size * Q * block_size; j++) l_vec_array[j] = array[j]; + CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array)); + l_vec_array += field_size * Q * block_size; + } + // Reset input to 0.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); } - // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); - } else { - const CeedScalar *q_vec_array; - - // Copy Identity Outputs - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size)); - CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array)); - for (CeedInt i = 0; i < size * Q * block_size; i++) l_vec_array[i] = q_vec_array[i]; - CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array)); - l_vec_array += size * Q * block_size; } } } @@ -593,12 +699,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL)); } + CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -609,6 +715,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o 
CeedCallBackend(CeedVectorRestoreArray(l_vec, &l_vec_array)); CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); CeedCallBackend(CeedElemRestrictionApply(block_rstr, CEED_TRANSPOSE, l_vec, *assembled, request)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -634,6 +742,10 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) { CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedFree(&impl->skip_rstr_in)); + CeedCallBackend(CeedFree(&impl->skip_rstr_out)); + CeedCallBackend(CeedFree(&impl->e_data_out_indices)); + CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i])); CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); @@ -657,10 +769,6 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) { CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i = 0; i < impl->num_active_in; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); - } - CeedCallBackend(CeedFree(&impl->qf_active_in)); CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec)); CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr)); @@ -682,6 +790,7 @@ int CeedOperatorCreate_Blocked(CeedOperator op) { CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Blocked)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Blocked)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Blocked)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c index f37338f0d6..f50d2fc91e 100644 --- a/backends/blocked/ceed-blocked.c +++ b/backends/blocked/ceed-blocked.c @@ -1,4 
+1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -25,6 +25,7 @@ static int CeedInit_Blocked(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Blocked)); return CEED_ERROR_SUCCESS; diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h index 917f4eb604..e1976d6e43 100644 --- a/backends/blocked/ceed-blocked.h +++ b/backends/blocked/ceed-blocked.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -17,16 +17,17 @@ typedef struct { typedef struct { bool is_identity_qf, is_identity_rstr_op; - CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */ - CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; + CeedInt *e_data_out_indices; uint64_t *input_states; /* State counter of inputs */ + CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ CeedVector *e_vecs_in; /* Element block input E-vectors */ CeedVector *e_vecs_out; /* Element block output E-vectors */ CeedVector *q_vecs_in; /* Element block input Q-vectors */ CeedVector *q_vecs_out; /* Element block output Q-vectors */ + CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */ CeedInt num_inputs, num_outputs; - CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in; + CeedInt qf_size_in, qf_size_out; CeedVector qf_l_vec; CeedElemRestriction qf_block_rstr; } CeedOperator_Blocked; diff --git a/backends/ceed-backend-list-avx.h b/backends/ceed-backend-list-avx.h new file mode 100644 index 0000000000..5e19a016c7 --- /dev/null +++ b/backends/ceed-backend-list-avx.h @@ -0,0 +1,13 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. 
+ +CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked") +CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial") diff --git a/backends/ceed-backend-list-cuda.h b/backends/ceed-backend-list-cuda.h new file mode 100644 index 0000000000..87593f5b08 --- /dev/null +++ b/backends/ceed-backend-list-cuda.h @@ -0,0 +1,14 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. + +CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref") +CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen") +CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared") diff --git a/backends/ceed-backend-list-hip.h b/backends/ceed-backend-list-hip.h new file mode 100644 index 0000000000..e66fc98298 --- /dev/null +++ b/backends/ceed-backend-list-hip.h @@ -0,0 +1,14 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. 
+ +CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref") +CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen") +CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared") diff --git a/backends/ceed-backend-list-magma.h b/backends/ceed-backend-list-magma.h new file mode 100644 index 0000000000..66c985c884 --- /dev/null +++ b/backends/ceed-backend-list-magma.h @@ -0,0 +1,13 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. + +CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma") +CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det") diff --git a/backends/ceed-backend-list-memcheck.h b/backends/ceed-backend-list-memcheck.h new file mode 100644 index 0000000000..fa6f51b0bb --- /dev/null +++ b/backends/ceed-backend-list-memcheck.h @@ -0,0 +1,13 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. 
+ +CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked") +CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial") diff --git a/backends/ceed-backend-list-ref.h b/backends/ceed-backend-list-ref.h new file mode 100644 index 0000000000..ac3e21525d --- /dev/null +++ b/backends/ceed-backend-list-ref.h @@ -0,0 +1,15 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. + +CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial") +CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked") +CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked") +CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial") diff --git a/backends/ceed-backend-list-sycl.h b/backends/ceed-backend-list-sycl.h new file mode 100644 index 0000000000..88617e1b2b --- /dev/null +++ b/backends/ceed-backend-list-sycl.h @@ -0,0 +1,14 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. 
+ +CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref") +CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared") +CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen") diff --git a/backends/ceed-backend-list-xsmm.h b/backends/ceed-backend-list-xsmm.h new file mode 100644 index 0000000000..fee5f81102 --- /dev/null +++ b/backends/ceed-backend-list-xsmm.h @@ -0,0 +1,13 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// This header does not have guards because it is included multiple times. +// This will be expanded inside CeedRegisterAll() to call each registration function. +// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'. + +CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked") +CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial") diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h index 75b1d1fe75..77f8e34490 100644 --- a/backends/ceed-backend-list.h +++ b/backends/ceed-backend-list.h @@ -1,35 +1,29 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -// This header does not have guards because it is included multiple times. +// This header does not have guards because it may be included multiple times. -// List each backend registration function once here. 
-// This will be expanded inside CeedRegisterAll() to call each registration function in the order listed, and also to define weak symbol aliases for -// backends that are not configured. +// List each backend registration function in the corresponding `ceed-backend-list-*.h` file, grouped by install requirement. +// Include each of those files here. +// This will be expanded inside CeedRegisterAll() to call each registration function in the order listed. -CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked") -CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial") -CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref") -CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen") -CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared") -CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref") -CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen") -CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared") -CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref") -CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared") -CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen") -CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma") -CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det") -CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked") -CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial") -CEED_BACKEND(CeedRegister_Occa, 6, "/cpu/self/occa", "/cpu/openmp/occa", "/gpu/dpcpp/occa", "/gpu/opencl/occa", "/gpu/hip/occa", "/gpu/cuda/occa") -CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked") -CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial") -CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial") -CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked") -CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked") -CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial") +// Always compiled +#include 
"ceed-backend-list-ref.h" +// Requires AVX support +#include "ceed-backend-list-avx.h" +// Requires Valgrind +#include "ceed-backend-list-memcheck.h" +// Requires LIBXSMM +#include "ceed-backend-list-xsmm.h" +// Requires CUDA +#include "ceed-backend-list-cuda.h" +// Requires ROCm +#include "ceed-backend-list-hip.h" +// Requires SYCL +#include "ceed-backend-list-sycl.h" +// Requires MAGMA + (CUDA or ROCm) +#include "ceed-backend-list-magma.h" diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 8b2a8dfee5..7a36364d97 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -22,360 +23,466 @@ #include "../cuda/ceed-cuda-compile.h" #include "ceed-cuda-gen.h" +struct FieldReuse_Cuda { + CeedInt index; + bool is_input; + CeedEvalMode eval_mode; +}; + //------------------------------------------------------------------------------ -// Build single operator kernel +// Determine type of operator //------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { - using std::ostringstream; - using std::string; - - bool is_setup_done, is_identity_qf; - struct cudaDeviceProp prop; - Ceed ceed; - Ceed_Cuda *ceed_data; - CeedSize l_size; - CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; - CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedElemRestriction_Cuda *rstr_data; - CeedBasis basis; - CeedBasis_Cuda_shared *basis_data; - CeedQFunctionField *qf_input_fields, *qf_output_fields; - CeedQFunction_Cuda_gen *qf_data; - CeedQFunction qf; - CeedOperatorField *op_input_fields, *op_output_fields; - CeedOperator_Cuda_gen *data; - - CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); - if (is_setup_done) return CEED_ERROR_SUCCESS; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &data)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - Q_1d = Q; - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - - // TODO: put in a function? 
- // Check for restriction only identity operator - CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); - if (is_identity_qf) { - CeedEvalMode eval_mode_in, eval_mode_out; - - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); - CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, - "Backend does not implement restriction only identity operators"); - } - - ostringstream code; +static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, + CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, + CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d, + CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { + // Check if all are tensor + *is_all_tensor = true; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; - // TODO: put in a function? 
- // Add atomicAdd function for old NVidia architectures - CeedCallBackend(CeedGetData(ceed, &ceed_data)); - CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); - if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { - char *atomic_add_source; - const char *atomic_add_path; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_field_tensor; - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source)); - code << atomic_add_source; - CeedCallBackend(CeedFree(&atomic_add_path)); - CeedCallBackend(CeedFree(&atomic_add_source)); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + *is_all_tensor = *is_all_tensor && is_field_tensor; + } + CeedCallBackend(CeedBasisDestroy(&basis)); } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; - // Load basis source files - // TODO: generalize to accept different device functions? 
- { - char *tensor_basis_kernel_source; - const char *tensor_basis_kernel_path; - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source)); - code << tensor_basis_kernel_source; - CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); - CeedCallBackend(CeedFree(&tensor_basis_kernel_source)); - } - { - char *cuda_gen_template_source; - const char *cuda_gen_template_path; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_field_tensor; - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source)); - code << cuda_gen_template_source; - CeedCallBackend(CeedFree(&cuda_gen_template_path)); - CeedCallBackend(CeedFree(&cuda_gen_template_source)); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + *is_all_tensor = *is_all_tensor && is_field_tensor; + } + CeedCallBackend(CeedBasisDestroy(&basis)); } - // Get QFunction source and name - string qfunction_source(qf_data->qfunction_source); - string qfunction_name(qf_data->qfunction_name); - string operator_name; - operator_name = "CeedKernelCudaGenOperator_" + qfunction_name; + // Find max_P, max_P_1d, Q, and Q_1d + bool is_all_3d = true; - // Find dim, P_1d, Q_1d - data->max_P_1d = 0; + *max_P = 0; + *max_P_1d = 0; + *Q = 0; + *Q_1d = 0; for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { - bool 
is_tensor; + bool is_field_tensor; + CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Check if 3D + CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); + is_all_3d = is_all_3d && (field_dim == 3); + *max_dim = CeedIntMax(*max_dim, field_dim); - // Collect dim, P_1d, and Q_1d - CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); - CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - data->max_P_1d = CeedIntMax(data->max_P_1d, P_1d); + // Collect P, P_1d, Q, and Q_1d + CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); + *max_P = CeedIntMax(*max_P, field_P); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); + *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); + } + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); + CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q = field_Q; + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); + CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q_1d = field_Q_1d; + } } + CeedCallBackend(CeedBasisDestroy(&basis)); } - // Check output bases for Q_1d, dim as well - // The only input basis might be CEED_BASIS_NONE for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { - bool is_tensor; + bool is_field_tensor; + CeedInt 
field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Check if 3D + CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); + is_all_3d = is_all_3d && (field_dim == 3); + *max_dim = CeedIntMax(*max_dim, field_dim); - // Collect Q_1d - CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); - CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + // Collect P, P_1d, Q, and Q_1d + CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); + *max_P = CeedIntMax(*max_P, field_P); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); + *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); + } + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); + CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q = field_Q; + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); + CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q_1d = field_Q_1d; + } } + CeedCallBackend(CeedBasisDestroy(&basis)); } - data->dim = dim; - data->Q_1d = Q_1d; // Only use 3D collocated gradient parallelization strategy when gradient is computed - // TODO: put in a function? 
- bool use_collograd_parallelization = false; - - if (dim == 3) { + *use_3d_slices = false; + if (is_all_3d && *is_all_tensor) { bool was_grad_found = false; for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { + CeedBasis_Cuda_shared *basis_data; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); - was_grad_found = true; + *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); + was_grad_found = true; + CeedCallBackend(CeedBasisDestroy(&basis)); } } for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { + CeedBasis_Cuda_shared *basis_data; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); - was_grad_found = true; + *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? 
*use_3d_slices : true); + was_grad_found = true; + CeedCallBackend(CeedBasisDestroy(&basis)); } } } + return CEED_ERROR_SUCCESS; +} - // Define CEED_Q_VLA - code << "\n#undef CEED_Q_VLA\n"; - if (dim != 3 || use_collograd_parallelization) { - code << "#define CEED_Q_VLA 1\n\n"; - } else { - code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; - } +//------------------------------------------------------------------------------ +// Setup fields +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, + CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, + CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, + bool use_3d_slices, bool skip_active_load) { + bool is_tensor = true, is_active = true; + CeedBasis basis; - code << qfunction_source; + CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); + if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + { + CeedVector vec; - // Setup - code << "\n// -----------------------------------------------------------------------------\n"; - code << "\nextern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar* W) {\n"; - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << " const CeedScalar* d_u_" << i << " = fields.inputs[" << i << "];\n"; - } + CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); } - for (CeedInt i = 0; i < num_output_fields; i++) { - code << " CeedScalar* d_v_" << i << " = fields.outputs[" << i 
<< "];\n"; - } + const char *field_name; + std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); + std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; + std::string option_name = (is_input ? "inputs" : "outputs"); + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedInt elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0; + CeedElemRestriction elem_rstr; + CeedBasis_Cuda_shared *basis_data; - code << " const CeedInt dim = " << dim << ";\n"; - code << " const CeedInt Q_1d = " << Q_1d << ";\n"; + // Field reuse info + bool use_previous_field = field_reuse.index != -1; - code << " extern __shared__ CeedScalar slice[];\n"; - // TODO put in a function? InitSharedData_Cuda? - code << " SharedData_Cuda data;\n"; - code << " data.t_id_x = threadIdx.x;\n"; - code << " data.t_id_y = threadIdx.y;\n"; - code << " data.t_id_z = threadIdx.z;\n"; - code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; - code << " data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; + CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name)); + code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n"; - code << "\n // -- Input field constants and basis data --\n"; - // TODO: Put in a function? 
- // Initialize constants, and matrices B and G - for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + // Get field data + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); + if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + if (basis != CEED_BASIS_NONE) { + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); + } + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); - // Set field constants - if (eval_mode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; + // Set field constants + code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; + if (is_tensor && !is_all_tensor) { + CeedInt P = 0; + + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n"; + } + code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? 
Q_1d : P_1d) << ";\n"; + if (eval_mode != CEED_EVAL_WEIGHT) { + code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; + } + + // Load basis data + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + break; + case CEED_EVAL_INTERP: + if (is_at_points) { + // AtPoints + if (!basis_data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; + + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallCuda(CeedBasisReturnCeed(basis), + cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; + else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; } else { - code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; + // Standard quadrature + if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; + else data->B.outputs[i] = basis_data->d_interp_1d; } - code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; - } + if (use_previous_field && !skip_active_load) { + std::string reuse_var = "s_B" + ((field_reuse.is_input ? 
"_in_" : "_out_") + std::to_string(field_reuse.index)); - // Load basis data - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - break; - case CEED_EVAL_INTERP: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; - break; - case CEED_EVAL_GRAD: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; - if (use_collograd_parallelization) { - data->G.inputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; - code << " loadMatrix(data, G.inputs[" << i << "], s_G_in_" << i << ");\n"; + code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; + } else { + bool is_collocated = false; + + CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { + code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; } else { - bool has_collo_grad = basis_data->d_collo_grad_1d; - data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; - code << " loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i - << ");\n"; + code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." 
<< option_name << "[" << i << "], s_B" << var_suffix << ");\n"; } - break; - case CEED_EVAL_WEIGHT: - break; // No action - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - } - } - - code << "\n // -- Output field constants and basis data --\n"; - for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + } + break; + case CEED_EVAL_GRAD: + if (is_at_points) { + // AtPoints + if (!basis_data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; - // Set field constants - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; - } else { - code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; - } - code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallCuda(CeedBasisReturnCeed(basis), + cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; + else data->B.outputs[i] = 
basis_data->d_chebyshev_interp_1d; + } else { + // Standard quadrature + if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; + else data->B.outputs[i] = basis_data->d_interp_1d; + } + if (is_tensor) { + if (use_previous_field && !skip_active_load) { + std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); - // Load basis data - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - break; // No action - case CEED_EVAL_INTERP: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; - break; - case CEED_EVAL_GRAD: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; - if (use_collograd_parallelization) { - data->G.outputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; - code << " loadMatrix(data, G.outputs[" << i << "], s_G_out_" << i << ");\n"; + code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; } else { - bool has_collo_grad = basis_data->d_collo_grad_1d; - data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; - code << " loadMatrix<" << (has_collo_grad ? 
"Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_" - << i << ");\n"; + bool is_collocated = false; + + CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { + code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; + } } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); - break; // Should not occur + if (is_at_points) break; // No G mat for AtPoints + if (use_3d_slices) { + if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; + else data->G.outputs[i] = basis_data->d_collo_grad_1d; + if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { + std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); + + code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else if (is_active && skip_active_load) { + code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } + } else { + bool has_collo_grad = basis_data->d_collo_grad_1d; + + if (is_input) data->G.inputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + if (has_collo_grad) { + if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { + std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); + + code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else if (is_active && skip_active_load) { + code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } + } else { + if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { + std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); + + code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else if (is_active && skip_active_load) { + code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") + << (is_tensor ? "" : var_suffix) << "];\n"; + code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G." 
+ << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } + } } - // LCOV_EXCL_STOP - } + break; + case CEED_EVAL_WEIGHT: + break; // No action + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } - code << "\n // -- Element loop --\n"; - code << " __syncthreads();\n"; - code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; - // Input basis apply if needed - // Generate the correct eval mode code for each input - code << " // -- Input field restrictions and basis actions --\n"; - for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedBasisDestroy(&basis)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Restriction +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, + CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, + CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, + bool use_3d_slices) { + std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); + std::string P_name = (is_all_tensor ? 
"P_1d" : "P") + var_suffix; + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedInt elem_size = 0, num_comp = 0; + CeedSize l_size; + CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; + CeedElemRestriction_Cuda *rstr_data; + CeedElemRestriction elem_rstr; + + // Get field data + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); + if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + } + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); + + // Restriction + if (is_input) { + // Input + if (field_input_buffer[i] != i) { + std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); + + // Restriction was already done for previous input + code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; + } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { + if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { + // No basis action, so r_e_in_* is also r_q_in_* and needs to be allocated + code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; + } else if (rstr_type != CEED_RESTRICTION_POINTS) { + // Otherwise we're using the scratch space + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; + } + switch (rstr_type) { + case CEED_RESTRICTION_STANDARD: { + CeedInt comp_stride; + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; +
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix + << ");\n"; + break; + } + case CEED_RESTRICTION_STRIDED: { + bool has_backend_strides; + CeedInt num_elem; - // TODO: put in a function? - // Restriction - if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { - code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; - bool is_strided; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); + } + code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] + << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; + code << tab << "ReadLVecStrided" << (is_all_tensor ? 
max_dim : 1) << "d(data, elem, d" << var_suffix << ", r_e" + << var_suffix << ");\n"; + break; + } + case CEED_RESTRICTION_POINTS: { + CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (!is_strided) { + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + break; + } + // LCOV_EXCL_START + case CEED_RESTRICTION_ORIENTED: + case CEED_RESTRICTION_CURL_ORIENTED: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + } else { + // Output + switch (rstr_type) { + case CEED_RESTRICTION_STANDARD: { CeedInt comp_stride; CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); - code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); - code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); - data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; - code << " readDofsOffset" << dim << "d(data, l_size_in_" << i - << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; - } else { + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; + code << tab << "WriteLVecStandard" << (is_all_tensor ? 
max_dim : 1) << "d(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix + << ");\n"; + break; + } + case CEED_RESTRICTION_STRIDED: { bool has_backend_strides; CeedInt num_elem; @@ -386,328 +493,2202 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); } - code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; - code << " readDofsStrided" << dim << "d(data, elem, d_u_" << i << ", r_u_" << i << ");\n"; + code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] + << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; + code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d(data, elem, r_e" << var_suffix << ", d" << var_suffix + << ");\n"; + break; } + case CEED_RESTRICTION_POINTS: + data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; + break; + // LCOV_EXCL_START + case CEED_RESTRICTION_ORIENTED: + case CEED_RESTRICTION_CURL_ORIENTED: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, + CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, + bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) { + bool is_tensor = true, is_collocated = true; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + 
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + + std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); + std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0; + CeedElemRestriction elem_rstr; + + // Get field data + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); + if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + if (basis != CEED_BASIS_NONE) { + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); + } + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); - // TODO: put in a function? - // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + // Basis + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + if (is_input) { switch (eval_mode) { case CEED_EVAL_NONE: - if (!use_collograd_parallelization) { - code << " CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; + if (!use_3d_slices && !is_at_points) { + code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; } break; case CEED_EVAL_INTERP: - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" - << i << ", r_t_" << i << ");\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; + + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? 
Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix + << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; + } else { + std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) + : "InterpNonTensor"; + std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" + << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + } break; case CEED_EVAL_GRAD: - if (use_collograd_parallelization) { - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i - << ", s_B_in_" << i << ", r_t_" << i << ");\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; + + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix + << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; + } else if (use_3d_slices) { + std::string function_name = + (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d"; + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix + << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + } else if (is_tensor) { + bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; + std::string function_name = + (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? 
"CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); + std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" + << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" + << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; } else { - CeedInt P_1d; + std::string function_name = "GradNonTensor"; - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n"; - code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n"; + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; } break; - case CEED_EVAL_WEIGHT: - code << " CeedScalar r_t_" << i << "[Q_1d];\n"; - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->W = basis_data->d_q_weight_1d; - code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(data, W, r_t_" << i << ");\n"; - break; // No action + case CEED_EVAL_WEIGHT: { + if (is_at_points) { + code << tab << "// Nothing to do AtPoints\n"; + } else { + CeedBasis_Cuda_shared *basis_data; + std::string function_name = is_tensor + ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? 
"" : "Flattened")) + : "WeightNonTensor"; + + code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->W = basis_data->d_q_weight_1d; + code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; + } + break; + } + // LCOV_EXCL_START case CEED_EVAL_DIV: - break; // TODO: Not implemented case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } - } + } else { + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; + break; // No action + case CEED_EVAL_INTERP: + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; - // TODO: put in a function + separate collograd logic - // Q function - code << "\n // -- Output field setup --\n"; - for (CeedInt i = 0; i < num_output_fields; i++) { - code << "\n // ---- Output field " << i << " ----\n"; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - if (eval_mode == CEED_EVAL_GRAD) { - if (use_collograd_parallelization) { - // Accumulator for gradient slices - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n"; - code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; - code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; - code << " r_tt_" << i << "[j + i*Q_1d] = 0.0;\n"; - code << " }\n"; - code << " }\n"; - } else { - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*dim*Q_1d];\n"; - } - } - if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n"; - } - } - // We treat quadrature points per slice in 3d to save registers - if 
(use_collograd_parallelization) { - code << "\n // Note: Using planes of 3D elements\n"; - code << "#pragma unroll\n"; - code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; - code << " // -- Input fields --\n"; - for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - bool is_strided; + code << tab << function_name << "(data, r_c" << var_suffix + << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else { + std::string function_name = + is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) + : "InterpTransposeNonTensor"; + std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); - code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; + code << tab << function_name << "(data, r_q" + << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (!is_strided) { - CeedInt comp_stride; + code << tab << function_name << "(data, r_c" << var_suffix + << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else if (use_3d_slices) { + std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? 
"CollocatedNodes" : "") + + std::to_string(dim) + "d"; - CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); - code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; - CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); - code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); - data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; - code << " readSliceQuadsOffset" - << "3d(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" - << i << ", r_q_" << i << ");\n"; - } else { - bool has_backend_strides; - CeedInt num_elem; + code << tab << function_name << "(data, r_q" << var_suffix + << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else if (is_tensor) { + bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; + std::string function_name = + (dim == 1 ? "GradTranspose" + : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); + std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? 
P_name : Q_name); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); - CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); - CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + code << tab << function_name << "(data, r_q" + << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else { + std::string function_name = "GradTransposeNonTensor"; - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); - } - code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; - code << " readSliceQuadsStrided" - << "3d(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n"; - } - break; - case CEED_EVAL_INTERP: - code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; - code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; - code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1d];\n"; - code << " }\n"; - break; - case CEED_EVAL_GRAD: - code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*dim];\n"; - code << " gradCollo3d(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n"; - break; - case CEED_EVAL_WEIGHT: - code << " CeedScalar r_q_" << i << "[1];\n"; - code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; - break; // No action - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: + code << tab << function_name << "(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + CeedCallBackend(CeedBasisDestroy(&basis)); + return CEED_ERROR_SUCCESS; +} + 
+//------------------------------------------------------------------------------ +// QFunction +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt max_dim, + CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields, + CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, + CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, + std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points, + bool use_3d_slices, bool is_assemble) { + std::string Q_name = is_all_tensor ? "Q_1d" : "Q"; + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedElemRestriction elem_rstr; + + // Setup output arrays + code << "\n"; + code << tab << "// -- Output field setup\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: + if (is_at_points) { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; + } else { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") + << "];\n"; + } + break; + case CEED_EVAL_INTERP: + if (is_at_points) { + // Accumulator for point data + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? 
Q_name : "1") << "; i++) r_c" << var_suffix + << "[i] = 0.0;\n"; + } else { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") + << "];\n"; + } + break; + case CEED_EVAL_GRAD: + if (is_at_points) { + // Accumulator for point data + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix + << "[i] = 0.0;\n"; + } else if (use_3d_slices) { + // Accumulator for gradient slices + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; + code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; + } else { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" + << (is_all_tensor && (max_dim >= 3) ? 
Q_name : "1") << "];\n"; + } + break; + case CEED_EVAL_WEIGHT: + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + + if (is_at_points) { + // We need to handle batches of points + code << "\n"; + code << tab << "// Note: Using batches of points\n"; + code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; + code << tab << "#pragma unroll\n"; + code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; + tab.push(); + code << tab << "const CeedInt p = i % max_num_points;\n\n"; + + code << tab << "// -- Coordinates\n"; + code << tab << "CeedScalar r_x[max_dim];\n"; + code << tab << "ReadPoint(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; + + code << tab << "// -- Input fields\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + const char *field_name; + std::string var_suffix = "_in_" + std::to_string(i); + std::string P_name = "P_1d" + var_suffix; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Basis action + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + code << tab << "ReadPoint(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_INTERP: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + code << tab << "InterpAtPoints" << max_dim << "d(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_GRAD: 
+ code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + code << tab << "GradAtPoints" << max_dim << "d(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_WEIGHT: + code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; + code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } } - code << "\n // -- Output fields --\n"; + code << "\n"; + code << tab << "// -- Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { case CEED_EVAL_NONE: - code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; - break; // No action + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + break; case CEED_EVAL_INTERP: - code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; break; case CEED_EVAL_GRAD: - code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*dim];\n"; + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; break; + // LCOV_EXCL_START case CEED_EVAL_WEIGHT: break; // Should not occur case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + + } else if (use_3d_slices) { + // We treat quadrature points per slice in 3d to save registers + code << "\n"; + code << tab << "// 
Note: Using planes of 3D elements\n"; + code << tab << "#pragma unroll\n"; + code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; + tab.push(); + code << tab << "// -- Input fields\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + const char *field_name; + std::string var_suffix = "_in_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Basis action + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + bool is_strided; + + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) { + bool has_backend_strides; + CeedInt num_elem, elem_size; + + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); + } + code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] + << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; + code << tab << "ReadEVecSliceStrided3d(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; + } else { + CeedSize l_size = 0; + CeedInt comp_stride; + CeedElemRestriction_Cuda *rstr_data; + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << 
var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + code << tab << "ReadEVecSliceStandard3d(data, l_size" + << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + break; + case CEED_EVAL_INTERP: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; + tab.push(); + code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; + tab.pop(); + code << tab << "}\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + code << tab << "GradColloSlice3d(data, q, r_q" << var_suffix << ", s_G" + << var_suffix << ", r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_WEIGHT: + code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; + code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + code << "\n"; + code << tab << "// -- Output fields\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + switch (eval_mode) { + case 
CEED_EVAL_NONE: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + break; + case CEED_EVAL_INTERP: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } } } else { - code << "\n // Note: Using full elements\n"; - code << " // -- Input fields --\n"; + code << "\n"; + code << tab << "// Note: Using full elements\n"; + code << tab << "{\n"; + tab.push(); + code << tab << "// -- Input fields\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - code << " CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; + code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; } - code << " // -- Output fields --\n"; + code << tab << "// -- Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - code << " CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; } } - code << "\n // -- QFunction Inputs and outputs --\n"; - code << " CeedScalar* in[" << num_input_fields << "];\n"; + + // Input and output buffers + code << "\n"; + code << tab << "// -- QFunction inputs and outputs\n"; + code << tab << "// ---- Inputs\n"; + code << 
tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - code << " in[" << i << "] = r_q_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; + code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; } - code << " CeedScalar* out[" << num_output_fields << "];\n"; + code << tab << "// ---- Outputs\n"; + code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - code << " out[" << i << "] = r_qq_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; + code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; } - code << "\n // -- Apply QFunction --\n"; - code << " " << qfunction_name << "(ctx, "; - if (dim != 3 || use_collograd_parallelization) { + + // Apply QFunction + code << "\n"; + code << tab << "// -- Apply QFunction\n"; + code << tab << "" << qfunction_name << "(ctx, "; + if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { code << "1"; } else { - code << "Q_1d"; + code << Q_name; } - code << ", in, out);\n"; - if (use_collograd_parallelization) { - code << " // -- Output fields --\n"; + code << ", inputs, outputs);\n"; + + if (is_at_points) { + // Map back to coefficients + code << "\n"; + code << tab << "// -- Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + std::string P_name = "P_1d" + var_suffix; + + 
CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; switch (eval_mode) { - case CEED_EVAL_NONE: - code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; - code << " r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n"; - code << " }\n"; - break; // No action + case CEED_EVAL_NONE: { + CeedInt comp_stride; + CeedElemRestriction elem_rstr; + + if (is_assemble) break; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + code << tab << "WritePoint(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" + << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; + break; + } case CEED_EVAL_INTERP: - code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; - code << " r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n"; - code << " }\n"; + code << tab << "if (i >= points.num_per_elem[elem]) {\n"; + tab.push(); + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "InterpTransposeAtPoints" << max_dim << "d(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; break; case CEED_EVAL_GRAD: - code << " gradColloTranspose3d(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n"; + code << tab << "if (i >= points.num_per_elem[elem]) {\n"; + tab.push(); + code << tab << "for (CeedInt j = 0; j < 
num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "GradTransposeAtPoints" << max_dim << "d(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; break; + // LCOV_EXCL_START case CEED_EVAL_WEIGHT: break; // Should not occur case CEED_EVAL_DIV: + case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + } else if (use_3d_slices) { + // Copy or apply transpose grad, if needed + code << "\n"; + code << tab << "// -- Output fields\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + std::string P_name = "P_1d" + var_suffix; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; + tab.push(); + code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; + tab.pop(); + code << tab << "}\n"; + break; + case CEED_EVAL_INTERP: + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; + tab.push(); + code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; + tab.pop(); + code << tab << "}\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "GradColloSliceTranspose3d(data, q, r_s" << var_suffix << ", s_G" + << var_suffix << ", r_q" << var_suffix << ");\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } } - code << " }\n"; 
} + tab.pop(); + code << tab << "}\n"; + return CEED_ERROR_SUCCESS; +} - // Output basis apply if needed - // Generate the correct eval mode code for each output - code << "\n // -- Output field basis action and restrictions --\n"; - for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); - // TODO put in a function - // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - code << " CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; - break; // No action - case CEED_EVAL_INTERP: - code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; - code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i - << ", s_B_out_" << i << ", r_v_" << i << ");\n"; - break; - case CEED_EVAL_GRAD: - code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; - if (use_collograd_parallelization) { - code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i - << ", s_B_out_" << i << ", r_v_" << i << ");\n"; - } else { - CeedInt P_1d; - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? 
"Collocated" : "") << dim << "d(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n"; - } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); - break; // Should not occur +//------------------------------------------------------------------------------ +// Build single operator kernel +//------------------------------------------------------------------------------ +extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) { + bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; + Ceed ceed; + CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda_gen *data; + std::ostringstream code; + Tab tab; + + CeedCallBackend(CeedOperatorGetData(op, &data)); + { + bool is_setup_done; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) { + *is_good_build = !data->use_fallback; + return CEED_ERROR_SUCCESS; + } + } + + // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + { + bool has_shared_bases = true; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + 
CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); } - // LCOV_EXCL_STOP + CeedCallBackend(CeedBasisDestroy(&basis)); } - // TODO put in a function - // Restriction - bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (!is_strided) { - CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); - code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; - CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); - code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); - data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; - code << " writeDofsOffset" << dim << "d(data, l_size_out_" << i - << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; - } else { - bool has_backend_strides; - CeedInt num_elem; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); - CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); - CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; 
+ is_all_nontensor = is_all_nontensor && !is_tensor; - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); } - code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; - code << " writeDofsStrided" << dim << "d(data, elem, r_v_" << i << ", d_v_" << i << ");\n"; + CeedCallBackend(CeedBasisDestroy(&basis)); + } + // -- Fallback to ref if not all bases are shared + if (!has_shared_bases) { + *is_good_build = false; + return CEED_ERROR_SUCCESS; } } + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Get operator data + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + { + CeedInt max_P = 0, max_P_1d = 0; + + CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, + op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, + &use_3d_slices)); + data->max_P_1d = is_all_tensor ? 
max_P_1d : max_P; + } + if (is_at_points) { + CeedInt coords_dim = 0; + CeedElemRestriction_Cuda *rstr_data; + CeedElemRestriction rstr_points = NULL; - code << " }\n"; - code << "}\n"; - code << "// -----------------------------------------------------------------------------\n\n"; + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim)); + CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); + data->points.indices = (CeedInt *)rstr_data->d_offsets; + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + if (max_dim == 0) max_dim = coords_dim; + if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim)); + } + if (max_dim == 0) max_dim = 1; + data->dim = max_dim; + if (is_at_points) use_3d_slices = false; + if (Q_1d == 0) { + if (is_at_points) Q_1d = max_num_points; + else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); + } + if (Q == 0) Q = Q_1d; + data->Q = Q; + data->Q_1d = Q_1d; - // View kernel for debugging - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n"); - CeedDebug(ceed, code.str().c_str()); + // Check for restriction only identity operator + { + bool is_identity_qf; - CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); - CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + if (is_identity_qf) { + CeedEvalMode eval_mode_in, eval_mode_out; - CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); + CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, + "Backend does not implement restriction only identity operators"); + } + } + + // Add atomicAdd function for old NVidia architectures + { + Ceed_Cuda *ceed_data; + struct cudaDeviceProp prop; + + CeedCallBackend(CeedGetData(ceed, &ceed_data)); + CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); + if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { + code << tab << "// AtomicAdd fallback source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; + } + } + + // Load basis source files + if (!is_all_nontensor) { + code << tab << "// Tensor basis source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; + } + if (!is_all_tensor) { + code << tab << "// Non-tensor basis source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n"; + } + if (!is_all_tensor && !is_all_nontensor) { + code << "// Tensor basis source\n"; + code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n"; + } + if (is_at_points) { + code << "// AtPoints basis source\n"; + code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n"; + } + code << "// CodeGen operator source\n"; + code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; + + // Get QFunction name + std::string qfunction_name(qf_data->qfunction_name); + std::string operator_name; + + operator_name = "CeedKernelCudaGenOperator_" + qfunction_name; + + // Define CEED_Q_VLA + code << "\n" << tab << "#undef CEED_Q_VLA\n"; + if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { + code << tab << "#define CEED_Q_VLA 1\n\n"; + } else { + code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; + } + + // Add user QFunction source + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); + + code << tab << "// User QFunction source\n"; + code << tab << "#include \"" << source_path << "\"\n\n"; + } 
+ + // Setup + code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "// Operator Kernel\n"; + code << tab << "// \n"; + code << tab << "// d_[in,out]_i: CeedVector device array\n"; + code << tab << "// r_[in,out]_e_i: Element vector register\n"; + code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; + code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; + code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; + code << tab << "// \n"; + code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; + code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; + code << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "extern \"C\" __global__ void " << operator_name + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + "points) {\n"; + tab.push(); + + // Scratch buffers + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + } + + code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; + if (!is_all_tensor) { + code << tab << "const CeedInt Q = " << Q << ";\n"; + } + if (!is_all_nontensor) { + code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; + } + if (is_at_points) { + code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; + code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << 
";\n"; + } + + // Shared data + code << tab << "extern __shared__ CeedScalar slice[];\n"; + code << tab << "SharedData_Cuda data;\n"; + code << tab << "data.t_id_x = threadIdx.x;\n"; + code << tab << "data.t_id_y = threadIdx.y;\n"; + code << tab << "data.t_id_z = threadIdx.z;\n"; + code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; + code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; + + // -- Determine input mat reuse + FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat reuse + 
FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + 
output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // Initialize constants, and matrices B and G + code << "\n" << tab << "// Input field constants and basis data\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], + max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); + } + code << "\n" << tab << "// Output field constants and basis data\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], + max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); + } + + // Loop over all elements + code << "\n" << tab << "// Element loop\n"; + code << tab << "__syncthreads();\n"; + code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; + tab.push(); + + // -- Compute minimum buffer space needed + CeedInt max_rstr_buffer_size = 1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + code << tab << "// Scratch restriction buffer space\n"; + code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; + + // -- Determine best input field processing order + CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + field_rstr_in_buffer[i] = -1; + input_field_order[i] = -1; + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + field_rstr_in_buffer[i] = i; + is_ordered[i] = true; + input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + 
field_rstr_in_buffer[j] = i; + is_ordered[j] = true; + input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + + // -- Input restriction and basis + code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + const char *field_name; + const CeedInt f = input_field_order[i]; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], + max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + + // -- Q function + CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, + qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, + Q_1d, is_all_tensor, is_at_points, use_3d_slices, false)); + + // -- Output basis and restriction + code << "\n" << tab << "// -- Output field basis action and restrictions\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, + 
is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, + false, is_all_tensor, is_at_points, use_3d_slices)); + } + + // Close loop and function + tab.pop(); + code << tab << "}\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "// -----------------------------------------------------------------------------\n\n"; + + // Compile + { + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + + data->thread_1d = T_1d; + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); + } else { + *is_good_build = false; + data->use_fallback = true; + } + } + CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Build AtPoints assembly operator kernel +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, bool is_full, bool *is_good_build) { + bool is_all_tensor = true, is_at_points = false, use_3d_slices = false; + Ceed ceed; + CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda_gen *data; + std::ostringstream code; + Tab tab; + + // Check compatibility + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + 
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported"); + + // Retrieve operator data + CeedCallBackend(CeedOperatorGetData(op, &data)); + Q = data->Q; + Q_1d = data->Q_1d; + max_dim = data->dim; + { + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + + // Add atomicAdd function for old NVidia architectures + { + Ceed_Cuda *ceed_data; + struct cudaDeviceProp prop; + + CeedCallBackend(CeedGetData(ceed, &ceed_data)); + CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); + if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { + code << tab << "// AtomicAdd fallback source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; + } + } + + // Load basis source files + code << tab << "// Tensor basis source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; + code << tab << "// AtPoints basis source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n"; + code << tab << "// CodeGen operator source\n"; + code << tab << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; + + // Get QFunction name + std::string qfunction_name(qf_data->qfunction_name); + std::string operator_name; + + if (is_full) { + operator_name = "CeedKernelCudaGenOperatorFullAssembly_" + qfunction_name; + } else { + operator_name = "CeedKernelCudaGenOperatorDiagonalAssembly_" + qfunction_name; + } + + // Define CEED_Q_VLA + 
code << "\n" << tab << "#undef CEED_Q_VLA\n"; + code << tab << "#define CEED_Q_VLA 1\n\n"; + + // Add user QFunction source + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); + + code << tab << "// User QFunction source\n"; + code << tab << "#include \"" << source_path << "\"\n\n"; + } + + // Setup + code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "// Operator Assembly Kernel\n"; + code << tab << "// \n"; + code << tab << "// d_[in,out]_i: CeedVector device array\n"; + code << tab << "// r_[in,out]_e_i: Element vector register\n"; + code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; + code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; + code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; + code << tab << "// \n"; + code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; + code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; + code << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "extern \"C\" __global__ void " << operator_name + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + "points, CeedScalar *__restrict__ values_array) {\n"; + tab.push(); + + // Scratch buffers + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + code << tab << 
"CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + } + + code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; + code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; + code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; + code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; + + // Shared data + code << tab << "extern __shared__ CeedScalar slice[];\n"; + code << tab << "SharedData_Cuda data;\n"; + code << tab << "data.t_id_x = threadIdx.x;\n"; + code << tab << "data.t_id_y = threadIdx.y;\n"; + code << tab << "data.t_id_z = threadIdx.z;\n"; + code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; + code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; + + // -- Determine input mat reuse + FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat 
reuse + FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + if (basis_i == basis_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // Initialize constants, and matrices B and G + code << "\n" << tab << "// Input field constants and basis data\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], + max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); + } + code << "\n" 
<< tab << "// Output field constants and basis data\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], + max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); + } + + // Loop over all elements + code << "\n" << tab << "// Element loop\n"; + code << tab << "__syncthreads();\n"; + code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; + tab.push(); + + // -- Compute minimum buffer space needed + CeedInt max_rstr_buffer_size = 1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + code << tab << "// Scratch restriction buffer space\n"; + code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; + + // -- Determine best input field processing order + CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + field_rstr_in_buffer[i] = -1; + input_field_order[i] = -1; + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + field_rstr_in_buffer[i] = i; + is_ordered[i] = true; + input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + field_rstr_in_buffer[j] = i; + is_ordered[j] = true; + input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + + // -- Input restriction and basis + code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; + CeedInt active_field_index = -1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false; + const char *field_name; + const CeedInt f = 
input_field_order[i]; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + if (is_active) { + std::string var_suffix = "_in_" + std::to_string(f); + + code << tab << "// Active field - no restriction or basis action here\n"; + if (active_field_index == -1) { + active_field_index = f; + code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1") + << "] = {0.0};\n"; + } else { + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; + } + } else { + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], + max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + } + + // -- Loop over active field + std::string active_var_suffix = "_in_" + std::to_string(active_field_index); + + code << "\n" << tab << "// Loop over nodes in active field\n"; + code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix + << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? 
"*P_1d" + active_var_suffix : "") << "; n++) {\n"; + tab.push(); + + // -- Set current active node and component to 1 + code << tab << "// Set current active node and component to 1.0\n"; + code << tab << "SetEVecStandard" << max_dim << "d_Single(data, n, 1.0, r_e" + << active_var_suffix << ");\n\n"; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false; + const char *field_name; + const CeedInt f = input_field_order[i]; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (!is_active) continue; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + + // -- Q function + CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, + qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, + Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); + + // -- Output basis and restriction + code << "\n" << tab << "// -- Output field basis action and restrictions\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + const char *field_name; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (!is_active) continue; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + + // ---- Basis action + 
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, + is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Restriction + if (is_full) { + std::string var_suffix = "_out_" + std::to_string(i); + CeedInt comp_stride; + CeedSize l_size; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + code << tab << "WriteLVecStandard" << max_dim << "d_Assembly(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n"; + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } else { + std::string var_suffix = "_out_" + std::to_string(i); + CeedInt comp_stride; + CeedSize l_size; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + code << tab << "WriteLVecStandard" << max_dim << "d_Single(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n"; + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + + // -- Reset current active node and component + code << "\n" << tab << "// Reset current active node and component to 0.0\n"; + code << tab << "SetEVecStandard" << max_dim << "d_Single(data, n, 0.0, r_e" 
+ << active_var_suffix << ");\n"; + + // -- End of loop over active field + tab.pop(); + code << tab << "}\n"; + + // Close loop and function + tab.pop(); + code << tab << "}\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "// -----------------------------------------------------------------------------\n\n"; + + // Compile + { + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + + data->thread_1d = T_1d; + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), + is_full ? &data->assemble_full : &data->assemble_diagonal)); + } else { + *is_good_build = false; + data->use_assembly_fallback = true; + } + } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) { + return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, false, is_good_build); +} + +extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) { + return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, true, is_good_build); +} + +//------------------------------------------------------------------------------ +// Build QFunction assembly operator kernel +//------------------------------------------------------------------------------ +extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build) { + bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; + Ceed ceed; + CeedInt Q, Q_1d, num_input_fields, num_output_fields, 
max_dim = 1, max_num_points = 0; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda_gen *data; + std::ostringstream code; + Tab tab; + + // Check compatibility + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported"); + + // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + { + bool has_shared_bases = true; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; + + 
CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + } + + // Retrieve operator data + CeedCallBackend(CeedOperatorGetData(op, &data)); + Q = data->Q; + Q_1d = data->Q_1d; + max_dim = data->dim; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Add atomicAdd function for old NVidia architectures + { + Ceed_Cuda *ceed_data; + struct cudaDeviceProp prop; + + CeedCallBackend(CeedGetData(ceed, &ceed_data)); + CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); + if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { + code << tab << "// AtomicAdd fallback source\n"; + code << tab << "#include \n\n"; + } + } + + // Load basis source files + if (!is_all_nontensor) { + code << tab << "// Tensor basis source\n"; + code << tab << "#include \n\n"; + } + if (!is_all_tensor) { + code << tab << "// Non-tensor basis source\n"; + code << tab << "#include \n\n"; + } + if (!is_all_tensor && !is_all_nontensor) { + code << "// Tensor basis source\n"; + code << "#include \n\n"; + } + code << "// CodeGen operator source\n"; + code << "#include \n\n"; + + // Get QFunction name + std::string qfunction_name(qf_data->qfunction_name); + std::string operator_name; + + operator_name = "CeedKernelCudaGenQFunctionAssembly_" + qfunction_name; + + // Define CEED_Q_VLA + code << "\n" << tab << "#undef CEED_Q_VLA\n"; + if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { + code << tab << "#define CEED_Q_VLA 1\n\n"; + } else { 
+ code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; + } + + // Add user QFunction source + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); + + code << tab << "// User QFunction source\n"; + code << tab << "#include \"" << source_path << "\"\n\n"; + } + + // Setup + code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "// Operator Assembly Kernel\n"; + code << tab << "// \n"; + code << tab << "// d_[in,out]_i: CeedVector device array\n"; + code << tab << "// r_[in,out]_e_i: Element vector register\n"; + code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; + code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; + code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; + code << tab << "// \n"; + code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; + code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; + code << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "extern \"C\" __global__ void " << operator_name + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + "points, CeedScalar *__restrict__ values_array) {\n"; + tab.push(); + + // Scratch buffers + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + + { + CeedVector vec; + 
+ CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (is_active) { + code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + } + } + + code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; + if (!is_all_tensor) { + code << tab << "const CeedInt Q = " << Q << ";\n"; + } + if (!is_all_nontensor) { + code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; + } + + // Shared data + code << tab << "extern __shared__ CeedScalar slice[];\n"; + code << tab << "SharedData_Cuda data;\n"; + code << tab << "data.t_id_x = threadIdx.x;\n"; + code << tab << "data.t_id_y = threadIdx.y;\n"; + code << tab << "data.t_id_z = threadIdx.z;\n"; + code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; + code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; + + // -- Determine input mat reuse + FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + 
input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat reuse + FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // Initialize constants, and matrices B and G + code << "\n" << tab << "// Input field constants and basis data\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], + max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); + } + code << "\n" << tab << "// Output field constants and basis data\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], + max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); + } + + // Loop over all elements + code << "\n" << tab << "// Element loop\n"; + code << tab << "__syncthreads();\n"; + code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; + tab.push(); + + // -- Compute minimum buffer space needed + CeedInt max_rstr_buffer_size = 1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if 
(eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + code << tab << "// Scratch restriction buffer space\n"; + code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; + + // -- Determine best input field processing order + CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + field_rstr_in_buffer[i] = -1; + input_field_order[i] = -1; + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + field_rstr_in_buffer[i] = i; + is_ordered[i] = true; + input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT + 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + field_rstr_in_buffer[j] = i; + is_ordered[j] = true; + input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + + // -- Input restriction and basis + code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; + CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; + CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false; + const char *field_name; + const CeedInt f = input_field_order[i]; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + if (is_active) { + CeedEvalMode eval_mode; + CeedInt field_size; + + active_fields_in[num_active_in] = f; + num_active_in++; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); + if (eval_mode == CEED_EVAL_GRAD) { + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" + << (is_all_tensor && (max_dim >= 3) ? 
"Q_1d" : "1") << "] = {0.};\n"; + } else { + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + } + code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; + } else { + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], + max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + } + code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; + for (CeedInt i = 0; i < num_active_in; i++) { + code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); + } + code << "};\n"; + code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; + for (CeedInt i = 0; i < num_active_in; i++) { + code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? 
", " : ""); + } + code << "};\n"; + + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (is_active) { + const char *field_name; + CeedInt field_size; + + active_fields_out[num_active_out] = i; + num_active_out++; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_assembly_size_out += field_size; + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; + } + } + code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; + for (CeedInt i = 0; i < num_active_out; i++) { + code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? 
", " : ""); + } + code << "};\n"; + code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; + + // -- Loop over active field + code << "\n" << tab << "CeedInt input_offset = 0;\n"; + code << tab << "// Loop over active QFunction input fields\n"; + code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; + code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; + tab.push(); + + // -- Loop over size of active field + code << "\n" << tab << "// Loop over current active input field size\n"; + code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; + code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; + tab.push(); + + // -- Set current active point and component to 1 + code << tab << "// Set current active point and component to 1.0\n"; + if (is_all_tensor && (max_dim >= 3)) { + code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; + } else { + code << tab << "r_q_in[a][s] = 1.0;\n"; + } + + // -- Q function + CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, + qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, + Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); + + // -- Output basis and restriction + code << "\n" << tab << "// -- Output field basis action and restrictions\n"; + CeedScalar offset = 0; + + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + const char *field_name; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (!is_active) continue; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + + // ---- Restriction + CeedInt field_size; + + code 
<< tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + offset += field_size; + } + + // -- Reset current active node and component + code << "\n" << tab << "// Reset current active node and component to 0.0\n"; + if (is_all_tensor && (max_dim >= 3)) { + code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; + } else { + code << tab << "r_q_in[a][s] = 0.0;\n"; + } + + // -- End of loop over size of active field + tab.pop(); + code << tab << "}\n"; + code << tab << "input_offset += field_size_in;\n"; + + // -- End of loop over active field + tab.pop(); + code << tab << "}\n"; + + // Close loop and function + tab.pop(); + code << tab << "}\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "// -----------------------------------------------------------------------------\n\n"; + + // Compile + { + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? 
Q_1d : Q, data->max_P_1d); + + data->thread_1d = T_1d; + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); + } else { + *is_good_build = false; + data->use_assembly_fallback = true; + } + } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h index 28031e8e3b..8fd3ee12c5 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -6,4 +6,7 @@ // This file is part of CEED: http://github.com/ceed #pragma once -CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op); +CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build); +CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build); +CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build); +CEED_INTERN int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build); diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 840d97afb9..97fcf6b4b0 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,10 @@ #include #include #include +#include +#include #include +#include #include "../cuda/ceed-cuda-common.h" #include "../cuda/ceed-cuda-compile.h" @@ -19,10 +22,18 @@ // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { + Ceed ceed; CeedOperator_Cuda_gen *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); + if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module)); + if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full)); + if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal)); + if (impl->module_assemble_qfunction) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_qfunction)); + if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -91,18 +102,23 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar) //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { +static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good, + CeedRequest *request) { + bool is_at_points, is_tensor; Ceed ceed; Ceed_Cuda *cuda_data; CeedInt num_elem, num_input_fields, num_output_fields; CeedEvalMode eval_mode; - CeedVector output_vecs[CEED_FIELD_MAX] = {NULL}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction_Cuda_gen *qf_data; 
CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Cuda_gen *data; + // Build the operator kernel + CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good)); + if (!(*is_run_good)) return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetData(ceed, &cuda_data)); CeedCallBackend(CeedOperatorGetData(op, &data)); @@ -112,66 +128,72 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Check for tensor-product bases - { - bool has_tensor_bases; - - CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases)); - // -- Fallback to ref if not all bases are tensor-product - if (!has_tensor_bases) { - CeedOperator op_fallback; - - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases"); - CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); - CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); - return CEED_ERROR_SUCCESS; - } - } - - // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op)); - // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { - CeedVector vec; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.inputs[i] = NULL; } else { + bool is_active; + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = input_arr; + else 
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); } } // Output vectors for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.outputs[i] = NULL; } else { + bool is_active; + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; - output_vecs[i] = vec; - // Check for multiple output modes - CeedInt index = -1; - - for (CeedInt j = 0; j < i; j++) { - if (vec == output_vecs[j]) { - index = j; - break; - } - } - if (index == -1) { - CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); - } else { - data->fields.outputs[i] = data->fields.outputs[index]; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.outputs[i] = output_arr; + else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Point coordinates, if needed + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + if (is_at_points) { + // Coords + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + + // Points per elem + if (num_elem != data->points.num_elem) { + CeedInt *points_per_elem; + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + CeedElemRestriction rstr_points = NULL; + + data->points.num_elem = num_elem; + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + 
CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + points_per_elem[e] = num_points_elem; } + if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); + CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedFree(&points_per_elem)); } } @@ -179,66 +201,661 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); // Apply operator - void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W}; - const CeedInt dim = data->dim; - const CeedInt Q_1d = data->Q_1d; - const CeedInt P_1d = data->max_P_1d; - const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - int max_threads_per_block, min_grid_size; + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points}; + int max_threads_per_block, min_grid_size, grid; + CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); - int block[3] = - { - thread_1d, - dim < 2 ? 1 : thread_1d, - -1, - }, - grid; - - CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, - cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); + int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 
1 : data->thread_1d), -1}; + + if (is_tensor) { + CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, + cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); + } else { + CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); + + grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + block[2] = elems_per_block; + } CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs)); + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs)); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { - CeedVector vec; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + bool is_active; + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); } } // Restore output arrays for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + bool is_active; + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; - // Check for multiple output modes - CeedInt index = -1; - for (CeedInt j = 0; j < i; j++) { - if (vec == 
output_vecs[j]) { - index = j; - break; + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Restore point coordinates, if needed + if (is_at_points) { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!(*is_run_good)) data->use_fallback = true; + return CEED_ERROR_SUCCESS; +} + +static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { + bool is_run_good = false; + const CeedScalar *input_arr = NULL; + CeedScalar *output_arr = NULL; + + // Try to run kernel + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); + CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request)); + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); + + // Fallback on unsuccessful run + if (!is_run_good) { + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + } + return CEED_ERROR_SUCCESS; +} + +static int 
CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { + bool is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential; + CeedInt num_suboperators; + const CeedScalar *input_arr = NULL; + CeedScalar *output_arr = NULL; + Ceed ceed; + CeedOperator *sub_operators; + cudaStream_t stream = NULL; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential)); + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); + if (is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream)); + for (CeedInt i = 0; i < num_suboperators; i++) { + CeedInt num_elem = 0; + + CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); + if (num_elem > 0) { + if (!is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream)); + CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request)); + if (!is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream)); + } + } + if (is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream)); + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); + CeedCallCuda(ceed, cudaDeviceSynchronize()); + + // Fallback on unsuccessful run + for (CeedInt i = 0; i < num_suboperators; i++) { + if (!is_run_good[i]) { + CeedOperator op_fallback; + + CeedDebug(ceed, "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n"); + CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback)); + 
CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// QFunction assembly +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { + Ceed ceed; + CeedOperator_Cuda_gen *data; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &data)); + + // Build the assembly kernel + if (!data->assemble_qfunction && !data->use_assembly_fallback) { + bool is_build_good = false; + + CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); + if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(op, &is_build_good)); + if (!is_build_good) data->use_assembly_fallback = true; + } + + // Try assembly + if (!data->use_assembly_fallback) { + bool is_run_good = true; + Ceed_Cuda *cuda_data; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedScalar *assembled_array; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + + CeedCallBackend(CeedGetData(ceed, &cuda_data)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Input vectors + for (CeedInt i = 0; i < num_input_fields; i++) { + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + data->fields.inputs[i] = NULL; + } else { + bool is_active; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = NULL; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); + + // Build objects if needed + if (build_objects) { + CeedInt qf_size_in = 0, qf_size_out = 0, Q; + + // Count number of active input fields + { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt field_size; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + // Check if active input + if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); + qf_size_in += field_size; + } + CeedCallBackend(CeedVectorDestroy(&vec)); + } + CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + } + + // Count number of active output fields + { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt field_size; + CeedVector vec; + + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + // Check if active output + if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_size_out += field_size; + } + CeedCallBackend(CeedVectorDestroy(&vec)); + } + CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + } + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + + // Actually 
build objects now + const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; + CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ + + // Create output restriction + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, + (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, + rstr)); + // Create assembled vector + CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); + } + + // Assembly array + CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array)); + + // Assemble QFunction + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; + bool is_tensor = false; + int max_threads_per_block, min_grid_size, grid; + + CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); + int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 
1 : data->thread_1d), -1}; + + if (is_tensor) { + CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, + cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); + } else { + CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); + + grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + block[2] = elems_per_block; + } + CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, + opargs)); + CeedCallCuda(ceed, cudaDeviceSynchronize()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Restore assembly array + CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); + + // Cleanup + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!is_run_good) { + data->use_assembly_fallback = true; + if (build_objects) { + CeedCallBackend(CeedVectorDestroy(assembled)); + CeedCallBackend(CeedElemRestrictionDestroy(rstr)); + } + } + } + CeedCallBackend(CeedDestroy(&ceed)); + + // Fallback, if needed + if (data->use_assembly_fallback) { + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for 
LinearAssemblyQFunction\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request)); + return CEED_ERROR_SUCCESS; + } + return CEED_ERROR_SUCCESS; +} + +static int CeedOperatorLinearAssembleQFunction_Cuda_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, true, assembled, rstr, request); +} + +static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, false, &assembled, &rstr, request); +} + +//------------------------------------------------------------------------------ +// AtPoints diagonal assembly +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) { + Ceed ceed; + CeedOperator_Cuda_gen *data; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &data)); + + // Build the assembly kernel + if (!data->assemble_diagonal && !data->use_assembly_fallback) { + bool is_build_good = false; + CeedInt num_active_bases_in, num_active_bases_out; + CeedOperatorAssemblyData assembly_data; + + CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); + CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, + NULL, NULL)); + if (num_active_bases_in == num_active_bases_out) { + CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); + if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good)); + } + if (!is_build_good) 
data->use_assembly_fallback = true; + } + + // Try assembly + if (!data->use_assembly_fallback) { + bool is_run_good = true; + Ceed_Cuda *cuda_data; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedScalar *assembled_array; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + + CeedCallBackend(CeedGetData(ceed, &cuda_data)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Input vectors + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + data->fields.inputs[i] = NULL; + } else { + bool is_active; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = NULL; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + + // Points per elem + if (num_elem != data->points.num_elem) { + CeedInt *points_per_elem; + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + CeedElemRestriction rstr_points = NULL; + + data->points.num_elem = num_elem; + 
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + points_per_elem[e] = num_points_elem; + } + if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); + CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedFree(&points_per_elem)); + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); + + // Assembly array + CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); + + // Assemble diagonal + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; + int max_threads_per_block, min_grid_size, grid; + + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); + int block[3] = {data->thread_1d, (data->dim == 1 ? 
1 : data->thread_1d), -1}; + + CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, + cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); + CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, + opargs)); + CeedCallCuda(ceed, cudaDeviceSynchronize()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Restore point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Restore assembly array + CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); + + // Cleanup + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!is_run_good) data->use_assembly_fallback = true; + } + CeedCallBackend(CeedDestroy(&ceed)); + + // Fallback, if needed + if (data->use_assembly_fallback) { + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, 
assembled, request)); + return CEED_ERROR_SUCCESS; + } + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// AtPoints full assembly +//------------------------------------------------------------------------------ +static int CeedOperatorAssembleSingleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) { + Ceed ceed; + CeedOperator_Cuda_gen *data; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &data)); + + // Build the assembly kernel + if (!data->assemble_full && !data->use_assembly_fallback) { + bool is_build_good = false; + CeedInt num_active_bases_in, num_active_bases_out; + CeedOperatorAssemblyData assembly_data; + + CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); + CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, + NULL, NULL)); + if (num_active_bases_in == num_active_bases_out) { + CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); + if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good)); + } + if (!is_build_good) data->use_assembly_fallback = true; + } + + // Try assembly + if (!data->use_assembly_fallback) { + bool is_run_good = true; + Ceed_Cuda *cuda_data; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedScalar *assembled_array; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + + CeedCallBackend(CeedGetData(ceed, &cuda_data)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, 
&op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Input vectors + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + data->fields.inputs[i] = NULL; + } else { + bool is_active; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = NULL; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + + // Points per elem + if (num_elem != data->points.num_elem) { + CeedInt *points_per_elem; + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + CeedElemRestriction rstr_points = NULL; + + data->points.num_elem = num_elem; + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + points_per_elem[e] = num_points_elem; } + if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); + CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedFree(&points_per_elem)); } 
- if (index == -1) { - CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); + + // Assembly array + CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); + CeedScalar *assembled_offset_array = &assembled_array[offset]; + + // Assemble diagonal + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, + &data->G, &data->W, &data->points, &assembled_offset_array}; + int max_threads_per_block, min_grid_size, grid; + + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); + int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; + + CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, + cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); + CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, + opargs)); + CeedCallCuda(ceed, cudaDeviceSynchronize()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); } } + + // Restore point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + 
CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Restore assembly array + CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); + + // Cleanup + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!is_run_good) data->use_assembly_fallback = true; } + CeedCallBackend(CeedDestroy(&ceed)); - // Restore context data - CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + // Fallback, if needed + if (data->use_assembly_fallback) { + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints SingleOperatorAssemble\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled)); + return CEED_ERROR_SUCCESS; + } return CEED_ERROR_SUCCESS; } @@ -246,14 +863,32 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Cuda_gen(CeedOperator op) { + bool is_composite, is_at_points; Ceed ceed; CeedOperator_Cuda_gen *impl; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen)); + } else { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); + } + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); + if (is_at_points) { + 
CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", + CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Cuda_gen)); + } + if (!is_at_points) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", + CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen)); + } CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c index ccff67a476..38c5cc9ee1 100644 --- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c +++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) { CeedCallBackend(CeedQFunctionGetData(qf, &data)); CeedCallCuda(CeedQFunctionReturnCeed(qf), cudaFree(data->d_c)); - CeedCallBackend(CeedFree(&data->qfunction_source)); CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -43,15 +42,11 @@ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); - // Read QFunction source CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); - CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); - CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index e1833be2a2..799c35fd1e 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -18,8 +18,7 @@ //------------------------------------------------------------------------------ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { char *resource_root; - const char fallback_resource[] = "/gpu/cuda/ref"; - Ceed ceed_shared; + Ceed ceed_shared, ceed_ref; Ceed_Cuda *data; CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); @@ -31,13 +30,18 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); - CeedCall(CeedInit("/gpu/cuda/shared", &ceed_shared)); + CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); + CeedCallBackend(CeedDestroy(&ceed_shared)); - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); + CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref)); + CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "CompositeOperatorCreate", CeedOperatorCreate_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h index d10dece242..0e04f3c4e4 100644 --- a/backends/cuda-gen/ceed-cuda-gen.h +++ b/backends/cuda-gen/ceed-cuda-gen.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -12,21 +12,23 @@ #include typedef struct { + bool use_fallback, use_assembly_fallback; CeedInt dim; - CeedInt Q_1d; + CeedInt Q, Q_1d; CeedInt max_P_1d; - CUmodule module; - CUfunction op; + CeedInt thread_1d; + CUmodule module, module_assemble_full, module_assemble_diagonal, module_assemble_qfunction; + CUfunction op, assemble_full, assemble_diagonal, assemble_qfunction; FieldsInt_Cuda indices; Fields_Cuda fields; Fields_Cuda B; Fields_Cuda G; CeedScalar *W; + Points_Cuda points; } CeedOperator_Cuda_gen; typedef struct { const char *qfunction_name; - const char *qfunction_source; void *d_c; } CeedQFunction_Cuda_gen; diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index 529c538182..7ed1865a95 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -10,6 +10,7 @@ #include #include #include +#include #include "../cuda/ceed-cuda-common.h" #include "../cuda/ceed-cuda-compile.h" @@ -18,7 +19,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -33,14 +35,12 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation - if (is_transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); - CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + // Clear v for transpose operation + if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); } CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); @@ -60,6 +60,7 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); } break; 
case CEED_EVAL_WEIGHT: { + CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; const int block_size_x = Q_1d; const int block_size_y = dim >= 2 ? Q_1d : 1; @@ -79,14 +80,180 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis apply - tensor AtPoints +//------------------------------------------------------------------------------ +static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + Ceed ceed; + CeedInt Q_1d, dim, max_num_points = num_points[0]; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 32; + const CeedScalar *d_x, *d_u; + CeedScalar *d_v; + CeedBasis_Cuda *data; + + 
CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + + // Weight handled separately + if (eval_mode == CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorSetValue(v, 1.0)); + return CEED_ERROR_SUCCESS; + } + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 
+ " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); + } + } + + // Build kernels if needed + if (data->num_points != max_num_points) { + CeedInt P_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + data->num_points = max_num_points; + + // -- Create interp matrix to Chebyshev coefficients + if (!data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; + + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + + // -- Compile kernels + const char basis_kernel_source[] = "// AtPoints basis source\n#include \n"; + CeedInt num_comp; + + if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", 
Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", + Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, + "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", + max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); + } + + // Get read/write access to u, v + CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); + if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + // Clear v for transpose operation + if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } + + // Basis action + switch (eval_mode) { + case CEED_EVAL_INTERP: { + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + + CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? 
data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, + interp_args)); + } break; + case CEED_EVAL_GRAD: { + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + + CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); + } break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_NONE: /* handled separately below */ + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + + // Restore vectors, cover CEED_EVAL_NONE + CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); + if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt num_nodes, num_qpts; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -104,14 +271,12 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation - if (is_transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); - CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + // Clear v for transpose operation + if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); } // Apply basis operation @@ -157,6 +322,7 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr } } break; case CEED_EVAL_WEIGHT: { + CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); @@ 
-169,6 +335,19 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ -182,10 +361,15 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallCuda(ceed, cuModuleUnload(data->module)); - CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); + if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); + CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -199,12 +383,13 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, 
&ceed)); CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallCuda(ceed, cuModuleUnload(data->module)); - CeedCallCuda(ceed, cudaFree(data->d_q_weight)); + if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); CeedCallCuda(ceed, cudaFree(data->d_interp)); CeedCallCuda(ceed, cudaFree(data->d_grad)); CeedCallCuda(ceed, cudaFree(data->d_div)); CeedCallCuda(ceed, cudaFree(data->d_curl)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -214,8 +399,6 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp; const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); const CeedInt interp_bytes = q_bytes * P_1d; @@ -225,33 +408,35 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedCallBackend(CeedCalloc(1, &data)); // Copy data to GPU - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); - CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); + if (q_weight_1d) { + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); + } CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); // Compile basis kernels + const char basis_kernel_source[] = "// Tensor basis source\n#include \n"; + 
CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, + Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -261,8 +446,6 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const 
int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp, q_comp_interp, q_comp_grad; const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedBasisNonTensor_Cuda *data; @@ -273,8 +456,10 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); + if (q_weight) { + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); + } if (interp) { const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; @@ -289,11 +474,9 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes } // Compile basis kernels + const char basis_kernel_source[] = "// Nontensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); @@ -301,14 +484,14 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -318,8 +501,6 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp, q_comp_interp, q_comp_div; const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedBasisNonTensor_Cuda *data; @@ -330,8 +511,10 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); + if (q_weight) { + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); + } if (interp) { const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; @@ -346,11 +529,9 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod } // Compile basis kernels + const char basis_kernel_source[] = "// Nontensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); @@ -358,14 +539,14 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -375,8 +556,6 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp, q_comp_interp, q_comp_curl; const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedBasisNonTensor_Cuda *data; @@ -387,8 +566,10 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); + if (q_weight) { + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); + } if (interp) { const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; @@ -403,11 +584,9 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no } // Compile basis kernels + const char basis_kernel_source[] = "// Nontensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); @@ -415,14 +594,14 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 670eb14e12..b4531fde50 100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -27,20 +27,28 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data - for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); - } - CeedCallBackend(CeedFree(&impl->e_vecs)); + CeedCallBackend(CeedFree(&impl->num_points)); + CeedCallBackend(CeedFree(&impl->skip_rstr_in)); + CeedCallBackend(CeedFree(&impl->skip_rstr_out)); + CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); + CeedCallBackend(CeedFree(&impl->input_field_order)); + CeedCallBackend(CeedFree(&impl->output_field_order)); + CeedCallBackend(CeedFree(&impl->input_states)); for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i])); CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } + CeedCallBackend(CeedFree(&impl->e_vecs_in)); CeedCallBackend(CeedFree(&impl->q_vecs_in)); for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i])); CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } + CeedCallBackend(CeedFree(&impl->e_vecs_out)); CeedCallBackend(CeedFree(&impl->q_vecs_out)); + CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem)); // QFunction assembly data for (CeedInt i = 0; i < impl->num_active_in; i++) { @@ -70,10 +78,11 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out)); CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in)); CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); + 
CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); } CeedCallBackend(CeedFree(&impl->diag)); @@ -84,6 +93,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module)); CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in)); CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out)); + CeedCallBackend(CeedDestroy(&ceed)); } CeedCallBackend(CeedFree(&impl->asmb)); @@ -94,8 +104,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, - CeedInt num_fields, CeedInt Q, CeedInt num_elem) { +static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis, + CeedVector *e_vecs, CeedVector *q_vecs, CeedInt num_fields, CeedInt Q, CeedInt num_elem) { Ceed ceed; CeedQFunctionField *qf_fields; CeedOperatorField *op_fields; @@ -111,68 +121,115 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool // Loop over fields for (CeedInt i = 0; i < num_fields; i++) { - bool is_strided = false, skip_restriction = false; - CeedSize q_size; - CeedInt size; - CeedEvalMode eval_mode; - CeedBasis basis; + bool is_active = false, is_strided = false, skip_e_vec = false; + CeedSize q_size; + CeedInt size; + CeedEvalMode eval_mode; + CeedVector l_vec; + CeedElemRestriction elem_rstr; + // Check whether this field can skip the element restriction: + // Input CEED_VECTOR_ACTIVE + // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE + // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT + // Input passive vector with 
CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&l_vec)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); - if (eval_mode != CEED_EVAL_WEIGHT) { - CeedElemRestriction elem_rstr; - - // Check whether this field can skip the element restriction: - // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); - - // First, check whether the field is input or output: - if (is_input) { - CeedVector vec; - - // Check for passive input - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec != CEED_VECTOR_ACTIVE) { - // Check eval_mode - if (eval_mode == CEED_EVAL_NONE) { - // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (is_strided) { - // Check if vector is already in preferred backend ordering - CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); - } - } - } - } - if (skip_restriction) { - // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 
- e_vecs[i + start_e] = NULL; - } else { - CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); - } + skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT); + if (!skip_e_vec && is_input && !is_active && eval_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec)); } + if (skip_e_vec) { + e_vecs[i] = NULL; + } else { + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i])); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); switch (eval_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)num_elem * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - break; case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)num_elem * Q * size; + q_size = (CeedSize)num_elem * (CeedSize)Q * (CeedSize)size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; - case CEED_EVAL_WEIGHT: // Only on input fields + case CEED_EVAL_WEIGHT: { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); - q_size = (CeedSize)num_elem * Q; + q_size = (CeedSize)num_elem * (CeedSize)Q; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + if (is_at_points) { + CeedInt num_points[num_elem]; + + for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q; + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, + q_vecs[i])); + } else { + CeedCallBackend(CeedBasisApply(basis, 
num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } + } + } + // Drop duplicate restrictions + if (is_input) { + for (CeedInt i = 0; i < num_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + skip_rstr[j] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } else { + for (CeedInt i = num_fields - 1; i >= 0; i--) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i - 1; j >= 0; j--) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + skip_rstr[j] = true; + apply_add_basis[i] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); } } + CeedCallBackend(CeedDestroy(&ceed)); return 
CEED_ERROR_SUCCESS; } @@ -180,7 +237,6 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ static int CeedOperatorSetup_Cuda(CeedOperator op) { - Ceed ceed; bool is_setup_done; CeedInt Q, num_elem, num_input_fields, num_output_fields; CeedQFunctionField *qf_input_fields, *qf_output_fields; @@ -191,7 +247,6 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); @@ -200,159 +255,625 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in)); + 
CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out)); impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; - // Set up infield and outfield e_vecs and q_vecs - // Infields - CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); - // Outfields - CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); + // Set up infield and outfield e-vecs and q-vecs + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q, + num_elem)); + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out, + impl->q_vecs_out, num_output_fields, Q, num_elem)); + + // Reorder fields to allow reuse of buffers + impl->max_active_e_vec_len = 0; + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) { + // CEED_EVAL_WEIGHT + CeedCallBackend(CeedVectorDestroy(&vec_i)); + continue; + }; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? 
e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->output_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? 
e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_output_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->output_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len)); + { + // Create two work vectors for diagonal assembly + CeedVector temp_1, temp_2; + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1)); + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2)); + } CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Setup Operator Inputs +// Restrict Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], - CeedOperator_Cuda *impl, CeedRequest *request) { - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; +static inline int 
CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Cuda *impl, + CeedRequest *request) { + bool is_active = false; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field]; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active && skip_active) return CEED_ERROR_SUCCESS; + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { - if (skip_active) continue; - else vec = in_vec; - } + // Restriction action + if (e_vec) { + // Restrict, if necessary + if (!impl->skip_rstr_in[input_field]) { + uint64_t state; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; - // Restrict, if necessary - if (!impl->e_vecs[i]) { - // No restriction for this field; read data directly from vec. 
- CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); - } else { - CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); - // Get evec - CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedVectorGetState(l_vec, &state)); + if (is_active || state != impl->input_states[input_field]) { + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } + impl->input_states[input_field] = state; } } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], +static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const bool skip_active, CeedOperator_Cuda *impl) { + bool is_active = false; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field]; + + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active && skip_active) return CEED_ERROR_SUCCESS; + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } + + // Basis action + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: { + const CeedScalar *e_vec_array; + + if (e_vec) { + CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array)); + } else { + CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array)); + } + CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array)); + break; + } + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec)); + CeedCallBackend(CeedBasisDestroy(&basis)); + break; + } + case CEED_EVAL_WEIGHT: + break; // No action + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Restore Input Vectors +//------------------------------------------------------------------------------ +static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Cuda *impl) { + bool is_active = false; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field]; + + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active && skip_active) return CEED_ERROR_SUCCESS; + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } + + // Restore e-vec + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + const CeedScalar *e_vec_array; + + 
CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, (CeedScalar **)&e_vec_array)); + if (e_vec) { + CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, &e_vec_array)); + } else { + CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array)); + } + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Apply and add to output +//------------------------------------------------------------------------------ +static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt Q, num_elem, num_input_fields, num_output_fields; + Ceed ceed; + CeedVector active_e_vec; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Setup + CeedCallBackend(CeedOperatorSetup_Cuda(op)); + + // Work vector + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec)); + + // Process inputs for (CeedInt i = 0; i < num_input_fields; i++) { - CeedInt elem_size, size; - CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; + CeedInt field = impl->input_field_order[i]; + + CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, + request)); + 
CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl)); + } + + // Output pointers, as necessary + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; - // Skip active input - if (skip_active) { - CeedVector vec; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); } - // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + } + + // Q function + CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl)); + } + + // Output basis and restriction + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + CeedInt field = impl->output_field_order[i]; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field]; + + // Output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active) { + l_vec = out_vec; + if (!e_vec) e_vec = active_e_vec; + } + // Basis action + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode)); switch (eval_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); - break; + break; // No action case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis)); + if (impl->apply_add_basis_out[field]) { + CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec)); + } else { + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; - case CEED_EVAL_WEIGHT: - break; // No action + } + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + // LCOV_EXCL_STOP + } } + + // Restore evec + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array)); + } + + // Restrict + if (!impl->skip_rstr_out[field]) { + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); } + + // Return work vector + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec)); + CeedCallBackend(CeedDestroy(&ceed)); + 
CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Restore Input Vectors +// CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; +static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) { + bool is_setup_done; + CeedInt max_num_points = -1, num_elem, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; + + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + { + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedCalloc(num_elem, &impl->num_points)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; - // Skip active input - if (skip_active) { - 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + impl->num_points[e] = num_points_elem; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - if (!impl->e_vecs[i]) { // This was a skip_restriction case - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + impl->max_num_points = max_num_points; + + // Allocate + CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out)); + impl->num_inputs = num_input_fields; + impl->num_outputs = num_output_fields; + + // Set up infield and outfield e-vecs and q-vecs + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, + max_num_points, num_elem)); + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out, + impl->q_vecs_out, num_output_fields, max_num_points, num_elem)); + + // Reorder fields to allow reuse of 
buffers + impl->max_active_e_vec_len = 0; + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) { + // CEED_EVAL_WEIGHT + CeedCallBackend(CeedVectorDestroy(&vec_i)); + continue; + }; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->output_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i)); + 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_output_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->output_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len)); + { + // Create two work vectors for diagonal assembly + CeedVector temp_1, temp_2; + + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1)); + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2)); + } + CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Input Basis Action AtPoints +//------------------------------------------------------------------------------ +static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt 
*num_points, + const bool skip_active, const bool skip_passive, CeedOperator_Cuda *impl) { + bool is_active = false; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field]; + + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (skip_active && is_active) return CEED_ERROR_SUCCESS; + if (skip_passive && !is_active) { + CeedCallBackend(CeedVectorDestroy(&l_vec)); + return CEED_ERROR_SUCCESS; + } + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } + + // Basis action + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: { + const CeedScalar *e_vec_array; + + if (e_vec) { + CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array)); } else { - CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array)); } + CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array)); + break; } + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis)); + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec)); + CeedCallBackend(CeedBasisDestroy(&basis)); + break; + } + case CEED_EVAL_WEIGHT: + break; // No action } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Apply and add to output +// Apply and add to output AtPoints //------------------------------------------------------------------------------ -static int 
CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { - CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; - CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; +static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields; + Ceed ceed; + CeedVector active_e_vec; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Setup - CeedCallBackend(CeedOperatorSetup_Cuda(op)); + CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op)); + num_points = impl->num_points; + max_num_points = impl->max_num_points; - // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); + // Work vector + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec)); - // Input basis apply if needed - CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); + // Get point coordinates + { + CeedVector point_coords = NULL; + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords)); + if (!impl->point_coords_elem) 
CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem)); + { + uint64_t state; + CeedCallBackend(CeedVectorGetState(point_coords, &state)); + if (impl->points_state != state) { + CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); + } + } + CeedCallBackend(CeedVectorDestroy(&point_coords)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + + // Process inputs + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt field = impl->input_field_order[i]; + + CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, + request)); + CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, + num_points, false, false, impl)); + } // Output pointers, as necessary for (CeedInt i = 0; i < num_output_fields; i++) { @@ -360,68 +881,86 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { - // Set the output Q-Vector to use the E-Vector data directly. 
- CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); } } // Q function - CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); + CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out)); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl)); + } - // Output basis apply if needed + // Output basis and restriction for (CeedInt i = 0; i < num_output_fields; i++) { - CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; + bool is_active = false; + CeedInt field = impl->output_field_order[i]; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field]; + + // Output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active) { + l_vec = out_vec; + if (!e_vec) e_vec = active_e_vec; + } - // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); // Basis action + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode)); switch (eval_mode) { 
case CEED_EVAL_NONE: break; // No action case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis)); + if (impl->apply_add_basis_out[field]) { + CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec)); + } else { + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); // LCOV_EXCL_STOP } } - } - - // Output restriction - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; // Restore evec - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array)); } - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - // Active - if (vec == CEED_VECTOR_ACTIVE) vec = 
out_vec; + if (!impl->skip_rstr_out[field]) { + CeedElemRestriction elem_rstr; - CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); } - // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + // Restore work vector + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -432,7 +971,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, CeedRequest *request) { Ceed ceed, ceed_parent; CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; - CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedScalar *assembled_array; CeedVector *active_inputs; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -453,19 +992,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, // Setup CeedCallBackend(CeedOperatorSetup_Cuda(op)); - // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); + // Process inputs + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request)); + CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, true, impl)); + } // Count number of active input 
fields if (!num_active_in) { for (CeedInt i = 0; i < num_input_fields; i++) { CeedScalar *q_vec_array; - CeedVector vec; + CeedVector l_vec; - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); @@ -474,12 +1015,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, CeedSize q_size = (CeedSize)Q * num_elem; CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); - CeedCallBackend( - CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); + CeedCallBackend(CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, + &q_vec_array[field * Q * num_elem])); } num_active_in += size; CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } impl->num_active_in = num_active_in; impl->qf_active_in = active_inputs; @@ -488,15 +1030,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, // Count number of active output fields if (!num_active_out) { for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; + CeedVector l_vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); num_active_out 
+= size; } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } impl->num_active_out = num_active_out; } @@ -511,16 +1053,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, // Create output restriction CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, rstr)); + (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides, + rstr)); // Create assembled vector CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); - // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); - // Assemble QFunction for (CeedInt in = 0; in < num_active_in; in++) { // Set Inputs @@ -530,38 +1070,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, } // Set Outputs for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; + CeedVector l_vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } // Apply QFunction CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } - // Un-set output q_vecs to prevent accidental overwrite of Assembled + // Un-set 
output q-vecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; + CeedVector l_vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); - // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl)); + } // Restore output CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -605,13 +1149,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedBasis basis; CeedEvalMode eval_mode; + CeedBasis basis; CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases"); - basis_in = basis; + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { @@ -621,6 +1166,7 @@ static inline int 
CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { num_eval_modes_in += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis @@ -637,7 +1183,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases"); - basis_out = basis; + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { @@ -647,6 +1194,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { num_eval_modes_out += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Operator data struct @@ -758,6 +1306,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, cudaMemcpyHostToDevice)); CeedCallBackend(CeedFree(&eval_modes_in)); CeedCallBackend(CeedFree(&eval_modes_out)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -766,8 +1318,6 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { //------------------------------------------------------------------------------ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { Ceed ceed; - char *diagonal_kernel_source; - const char *diagonal_kernel_path; CeedInt num_input_fields, num_output_fields, 
num_eval_modes_in = 0, num_eval_modes_out = 0; CeedInt num_comp, q_comp, num_nodes, num_qpts; CeedBasis basis_in = NULL, basis_out = NULL; @@ -789,14 +1339,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedEvalMode eval_mode; + CeedBasis basis; - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { num_eval_modes_in += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis @@ -808,14 +1362,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedEvalMode eval_mode; + CeedBasis basis; - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { num_eval_modes_out += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Operator data struct @@ -823,22 +1381,22 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedOperatorDiag_Cuda *diag = impl->diag; // Assemble kernel - CUmodule *module = is_point_block ? 
&diag->module_point_block : &diag->module; - CeedInt elems_per_block = 1; + const char diagonal_kernel_source[] = "// Diagonal assembly source\n#include \n"; + CUmodule *module = is_point_block ? &diag->module_point_block : &diag->module; + CeedInt elems_per_block = 1; + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE", use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? 
&diag->LinearPointBlock : &diag->LinearDiagonal)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -890,6 +1448,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); } + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); @@ -923,6 +1483,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -946,11 +1507,9 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, //------------------------------------------------------------------------------ // Single Operator Assembly Setup //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { +static int CeedOperatorAssembleSingleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { Ceed ceed; Ceed_Cuda *cuda_data; - char *assembly_kernel_source; - const char *assembly_kernel_path; CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, 
num_eval_modes_out = 0; CeedInt elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; @@ -975,13 +1534,17 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedBasis basis; - CeedEvalMode eval_mode; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); - basis_in = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr)); + if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); @@ -996,6 +1559,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee num_eval_modes_in += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis; basis_out and rstr_out only used if same as input, TODO @@ -1005,14 +1569,18 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedBasis basis; - CeedEvalMode eval_mode; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis 
basis; CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); - basis_out = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCallBackend(CeedBasisDestroy(&basis)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr)); + if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); @@ -1029,6 +1597,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee num_eval_modes_out += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); @@ -1047,20 +1616,16 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee } // Compile kernels + const char assembly_kernel_source[] = "// Full assembly source\n#include \n"; + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source 
Complete! -----\n"); CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); - CeedCallBackend(CeedFree(&assembly_kernel_path)); - CeedCallBackend(CeedFree(&assembly_kernel_source)); // Load into B_in, in order that they will be used in eval_modes_in { @@ -1093,11 +1658,9 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), cudaMemcpyHostToDevice)); } - - if (identity) { - CeedCallBackend(CeedFree(&identity)); - } + CeedCallBackend(CeedFree(&identity)); } + CeedCallBackend(CeedFree(&eval_modes_in)); // Load into B_out, in order that they will be used in eval_modes_out { @@ -1130,11 +1693,15 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), cudaMemcpyHostToDevice)); } - - if (identity) { - CeedCallBackend(CeedFree(&identity)); - } + CeedCallBackend(CeedFree(&identity)); } + CeedCallBackend(CeedFree(&eval_modes_out)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ 
-1142,11 +1709,11 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee // Assemble matrix data for COO matrix of assembled operator. // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. // -// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval -// modes). +// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator +// (could have multiple basis eval modes). // TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { +static int CeedOperatorAssembleSingle_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { Ceed ceed; CeedSize values_length = 0, assembled_qf_length = 0; CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; @@ -1172,7 +1739,7 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; // Setup - if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); + if (!impl->asmb) CeedCallBackend(CeedOperatorAssembleSingleSetup_Cuda(op, use_ceedsize_idx)); CeedOperatorAssemble_Cuda *asmb = impl->asmb; assert(asmb != NULL); @@ -1218,8 +1785,8 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); + 
CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, NULL, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, + shared_mem, args)); // Restore arrays CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); @@ -1239,6 +1806,270 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); } } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Assemble Linear QFunction AtPoints +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction"); +} + +//------------------------------------------------------------------------------ +// Assemble Linear Diagonal AtPoints +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields; + Ceed ceed; + CeedVector active_e_vec_in, active_e_vec_out; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + 
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Setup + CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op)); + num_points = impl->num_points; + max_num_points = impl->max_num_points; + + // Work vector + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in)); + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out)); + { + CeedSize length_in, length_out; + + CeedCallBackend(CeedVectorGetLength(active_e_vec_in, &length_in)); + CeedCallBackend(CeedVectorGetLength(active_e_vec_out, &length_out)); + // Need input e_vec to be longer + if (length_in < length_out) { + CeedVector temp = active_e_vec_in; + + active_e_vec_in = active_e_vec_out; + active_e_vec_out = temp; + } + } + + // Get point coordinates + { + CeedVector point_coords = NULL; + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords)); + if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem)); + { + uint64_t state; + CeedCallBackend(CeedVectorGetState(point_coords, &state)); + if (impl->points_state != state) { + CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); + } + } + CeedCallBackend(CeedVectorDestroy(&point_coords)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + + // Process inputs + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request)); + CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false, + impl)); + } + + // Output pointers, as 
necessary + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); + } + } + + // Loop over active fields + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false, is_active_at_points = true; + CeedInt elem_size = 1, num_comp_active = 1, e_vec_size = 0, field_in = impl->input_field_order[i]; + CeedRestrictionType rstr_type; + CeedVector l_vec; + CeedElemRestriction elem_rstr; + + // -- Skip non-active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[field_in], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&l_vec)); + if (!is_active || impl->skip_rstr_in[field_in]) continue; + + // -- Get active restriction type + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[field_in], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS; + if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + else elem_size = max_num_points; + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + + e_vec_size = elem_size * num_comp_active; + CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0)); + for (CeedInt s = 0; s < e_vec_size; s++) { + CeedVector q_vec = impl->q_vecs_in[field_in]; + + // Update unit vector + { + // Note: E-vec strides are node * (1) + comp * (elem_size * num_elem) + elem * (elem_size) + CeedInt node = (s - 1) % elem_size, comp = (s - 1) / elem_size; + CeedSize start = node * 
1 + comp * (elem_size * num_elem); + CeedSize stop = (comp + 1) * (elem_size * num_elem); + + if (s != 0) CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0)); + + node = s % elem_size, comp = s / elem_size; + start = node * 1 + comp * (elem_size * num_elem); + stop = (comp + 1) * (elem_size * num_elem); + CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0)); + } + + // Basis action + for (CeedInt j = 0; j < num_input_fields; j++) { + CeedInt field = impl->input_field_order[j]; + + CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, NULL, active_e_vec_in, num_elem, + num_points, false, true, impl)); + } + + // Q function + CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out)); + + // Output basis apply if needed + for (CeedInt j = 0; j < num_output_fields; j++) { + bool is_active = false; + CeedInt elem_size = 0; + CeedInt field_out = impl->output_field_order[j]; + CeedRestrictionType rstr_type; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_out[field_out], q_vec = impl->q_vecs_out[field_out]; + CeedElemRestriction elem_rstr; + + // ---- Skip non-active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field_out], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&l_vec)); + if (!is_active) continue; + if (!e_vec) e_vec = active_e_vec_out; + + // ---- Check if elem size matches + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field_out], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue; + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size)); + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, 
&elem_size)); + } + { + CeedInt num_comp = 0; + + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + if (e_vec_size != num_comp * elem_size) continue; + } + + // Basis action + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field_out], &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(q_vec, CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array)); + break; + } + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis)); + if (impl->apply_add_basis_out[field_out]) { + CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, + e_vec)); + } else { + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + break; + } + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + // LCOV_EXCL_STOP + } + } + + // Mask output e-vec + if (impl->skip_rstr_out[field_out]) { + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + continue; + } + CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec)); + + // Restrict + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + + // Reset q_vec for + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorGetArrayWrite(e_vec, CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); + } + } + + // Reset 
vec + if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(q_vec, 0.0)); + } + } + + // Restore CEED_EVAL_NONE + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + // Get eval_mode + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + + // Restore evec + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_vec_array)); + } + } + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl)); + } + + // Restore work vector + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in)); + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -1256,11 +2087,31 @@ int CeedOperatorCreate_Cuda(CeedOperator op) { CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda)); - CeedCallBackend( - CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, 
"LinearAssembleAddPointBlockDiagonal", + CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create operator AtPoints +//------------------------------------------------------------------------------ +int CeedOperatorCreateAtPoints_Cuda(CeedOperator op) { + Ceed ceed; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp index 03ace250fb..82d21af0ac 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -24,42 +24,36 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { using std::string; Ceed ceed; - const char *read_write_kernel_path, *read_write_kernel_source; CeedInt num_input_fields, num_output_fields, size; CeedQFunctionField *input_fields, *output_fields; CeedQFunction_Cuda *data; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); - // QFunction is built + CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); if (data->QFunction) return CEED_ERROR_SUCCESS; - CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided."); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); // QFunction kernel generation CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Build strings for final kernel - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-qfunction.h", &read_write_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); - { - char *source; - - CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &source)); - read_write_kernel_source = source; - } - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! 
-----\n"); - string qfunction_source(data->qfunction_source); string qfunction_name(data->qfunction_name); - string read_write(read_write_kernel_source); string kernel_name = "CeedKernelCudaRefQFunction_" + qfunction_name; ostringstream code; - // Defintions - code << read_write; - code << qfunction_source; - code << "\n"; + // Definitions + code << "// QFunction source\n"; + code << "#include \n\n"; + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided."); + + code << "// User QFunction source\n"; + code << "#include \"" << source_path << "\"\n\n"; + } code << "extern \"C\" __global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Cuda fields) {\n"; // Inputs @@ -69,7 +63,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { code << " const CeedInt size_input_" << i << " = " << size << ";\n"; code << " CeedScalar input_" << i << "[size_input_" << i << "];\n"; } - code << " const CeedScalar* inputs[" << num_input_fields << "];\n"; + code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_input_fields; i++) { code << " inputs[" << i << "] = input_" << i << ";\n"; } @@ -82,7 +76,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { code << " const CeedInt size_output_" << i << " = " << size << ";\n"; code << " CeedScalar output_" << i << "[size_output_" << i << "];\n"; } - code << " CeedScalar* outputs[" << num_output_fields << "];\n"; + code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_output_fields; i++) { code << " outputs[" << i << "] = output_" << i << ";\n"; } @@ -111,18 +105,10 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { code << " }\n"; code << "}\n"; - // View kernel for debugging - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated 
QFunction Kernels:\n"); - CeedDebug(ceed, code.str().c_str()); - // Compile kernel CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 0)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, kernel_name.c_str(), &data->QFunction)); - - // Cleanup - CeedCallBackend(CeedFree(&data->qfunction_source)); - CeedCallBackend(CeedFree(&read_write_kernel_path)); - CeedCallBackend(CeedFree(&read_write_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h index d8ca4f175b..360b8b9673 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c index f52aebb685..ded455665b 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -58,6 +58,7 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, C // Restore context CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -95,16 +96,13 @@ int CeedQFunctionCreate_Cuda(CeedQFunction qf) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); - // Read QFunction source CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); - CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "SetCUDAUserFunction", CeedQFunctionSetCUDAUserFunction_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c index 4257265987..491e658338 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c +++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -37,6 +37,7 @@ static inline int CeedQFunctionContextSyncH2D_Cuda(const CeedQFunctionContext ct impl->d_data = impl->d_data_owned; } CeedCallCuda(ceed, cudaMemcpy(impl->d_data, impl->h_data, ctx_size, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -64,6 +65,7 @@ static inline int CeedQFunctionContextSyncD2H_Cuda(const CeedQFunctionContext ct impl->h_data = impl->h_data_owned; } CeedCallCuda(ceed, cudaMemcpy(impl->h_data, impl->d_data, ctx_size, cudaMemcpyDeviceToHost)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -77,7 +79,9 @@ static inline int CeedQFunctionContextSync_Cuda(const CeedQFunctionContext ctx, case CEED_MEM_DEVICE: return CeedQFunctionContextSyncH2D_Cuda(ctx); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -205,6 +209,7 @@ static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx impl->d_data = data; break; } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -220,7 +225,9 @@ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, cons case CEED_MEM_DEVICE: return CeedQFunctionContextSetDataDevice_Cuda(ctx, copy_mode, data); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -335,6 +342,7 @@ int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); 
CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c index f253b5413d..f390ec0b4c 100644 --- a/backends/cuda-ref/ceed-cuda-ref-restriction.c +++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -24,36 +24,34 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) { Ceed ceed; bool is_deterministic; - char *restriction_kernel_source; - const char *restriction_kernel_path; CeedInt num_elem, num_comp, elem_size, comp_stride; CeedRestrictionType rstr_type; CeedElemRestriction_Cuda *impl; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size)); + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + } is_deterministic = impl->d_l_vec_indices != NULL; // Compile CUDA kernels switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - bool has_backend_strides; - CeedInt strides[3] = {1, num_elem * elem_size, 
elem_size}; + const char restriction_kernel_source[] = "// Strided restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-strided.h>\n"; + bool has_backend_strides; + CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); } - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); @@ -61,27 +59,30 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); } break; case CEED_RESTRICTION_STANDARD: { - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); + const char restriction_kernel_source[] = "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n"; + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); } break; + case CEED_RESTRICTION_POINTS: { + const char restriction_kernel_source[] = + "// AtPoints restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-at-points.h>\n\n" + "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n"; + + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose)); + } break; case CEED_RESTRICTION_ORIENTED: { - const char *offset_kernel_path; - char **file_paths = NULL; - CeedInt num_file_paths = 0; - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); - CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + const char restriction_kernel_source[] = + "// Oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-oriented.h>\n\n" + "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n"; + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); @@ -89,22 +90,11 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); - // Cleanup - CeedCallBackend(CeedFree(&offset_kernel_path)); - for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); - CeedCall(CeedFree(&file_paths)); } break; case CEED_RESTRICTION_CURL_ORIENTED: { - const char *offset_kernel_path; - char **file_paths = NULL; - CeedInt num_file_paths = 0; - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); - CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + const char restriction_kernel_source[] = + "// Curl oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h>\n\n" + "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n"; CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); @@ -114,19 +104,9 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); - // Cleanup - CeedCallBackend(CeedFree(&offset_kernel_path)); - for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); - CeedCall(CeedFree(&file_paths)); - } break; - case CEED_RESTRICTION_POINTS: { - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); - // LCOV_EXCL_STOP } break; } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -175,6 +155,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; + case CEED_RESTRICTION_POINTS: case CEED_RESTRICTION_STANDARD: { void *args[] = {&impl->d_offsets, &d_u, &d_v}; @@ -206,11 +187,6 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); } } break; - case CEED_RESTRICTION_POINTS: { - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); - // LCOV_EXCL_STOP - } break; } } else { // E-vector -> L-vector @@ -224,6 +200,17 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } break; + case 
CEED_RESTRICTION_POINTS: { + if (!is_deterministic) { + void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v}; + + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); + } else { + void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v}; + + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); + } + } break; case CEED_RESTRICTION_STANDARD: { if (!is_deterministic) { void *args[] = {&impl->d_offsets, &d_u, &d_v}; @@ -291,11 +278,6 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C } } } break; - case CEED_RESTRICTION_POINTS: { - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); - // LCOV_EXCL_STOP - } break; } } @@ -304,6 +286,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C // Restore arrays CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -335,14 +318,16 @@ static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, Cee //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { CeedElemRestriction_Cuda *impl; + CeedRestrictionType rstr_type; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); switch (mem_type) { case CEED_MEM_HOST: - *offsets = impl->h_offsets; + *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets; break; case CEED_MEM_DEVICE: - *offsets = impl->d_offsets; + *offsets = rstr_type == CEED_RESTRICTION_POINTS ? 
impl->d_offsets_at_points : impl->d_offsets; break; } return CEED_ERROR_SUCCESS; @@ -384,6 +369,17 @@ static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Get offset for padded AtPoints E-layout +//------------------------------------------------------------------------------ +static int CeedElemRestrictionGetAtPointsElementOffset_Cuda(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) { + CeedInt layout[3]; + + CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout)); + *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2]; + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy restriction //------------------------------------------------------------------------------ @@ -405,25 +401,31 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { CeedCallCuda(ceed, cudaFree((bool *)impl->d_orients_owned)); CeedCallBackend(CeedFree(&impl->h_curl_orients_owned)); CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned)); + CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned)); + CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_at_points_owned)); + CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned)); + CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_points_per_elem_owned)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Create transpose offsets and indices //------------------------------------------------------------------------------ -static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) { +static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt 
*indices) { Ceed ceed; bool *is_node; CeedSize l_size; - CeedInt num_elem, elem_size, num_comp, num_nodes = 0; + CeedInt num_elem, num_comp, num_nodes = 0; CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; + CeedRestrictionType rstr_type; CeedElemRestriction_Cuda *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); const CeedInt size_indices = num_elem * elem_size; @@ -486,6 +488,7 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedCallBackend(CeedFree(&l_vec_indices)); CeedCallBackend(CeedFree(&t_offsets)); CeedCallBackend(CeedFree(&t_indices)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -496,16 +499,27 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt8 *curl_orients, CeedElemRestriction rstr) { Ceed ceed, ceed_parent; bool is_deterministic; - CeedInt num_elem, elem_size; + CeedInt num_elem, num_comp, elem_size; CeedRestrictionType rstr_type; CeedElemRestriction_Cuda *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); + CeedCallBackend(CeedDestroy(&ceed_parent)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + // Use max number of points as elem size for AtPoints restrictions + 
if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedInt max_points = 0; + + for (CeedInt i = 0; i < num_elem; i++) { + max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]); + } + elem_size = max_points; + } const CeedInt size = num_elem * elem_size; CeedCallBackend(CeedCalloc(1, &impl)); @@ -526,6 +540,51 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, } } + // Pad AtPoints indices + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1; + CeedInt max_points = elem_size, *offsets_padded, *points_per_elem; + + CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction"); + CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded)); + CeedCallBackend(CeedMalloc(num_elem, &points_per_elem)); + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points = offsets[i + 1] - offsets[i]; + CeedInt last_point = 0; + + points_per_elem[i] = num_points; + at_points_size += num_points; + // -- Copy all points in element + for (CeedInt j = 0; j < num_points; j++) { + offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp; + last_point = offsets_padded[i * max_points + j]; + } + // -- Replicate out last point in element + for (CeedInt j = num_points; j < max_points; j++) { + offsets_padded[i * max_points + j] = last_point; + } + } + CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed, + &impl->h_offsets_at_points)); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt))); + CeedCallCuda(ceed, cudaMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt), + cudaMemcpyHostToDevice)); + impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned; + + // -- Use padded offsets for 
the rest of the setup + offsets = (const CeedInt *)offsets_padded; + copy_mode = CEED_OWN_POINTER; + CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp)); + + // -- Points per element + CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned, + &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt))); + CeedCallCuda(ceed, + cudaMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), cudaMemcpyHostToDevice)); + impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned; + } + // Set up device offset/orientation arrays if (rstr_type != CEED_RESTRICTION_STRIDED) { switch (mem_type) { @@ -534,7 +593,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt))); CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); impl->d_offsets = (CeedInt *)impl->d_offsets_owned; - if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets)); + if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets)); } break; case CEED_MEM_DEVICE: { CeedCallBackend(CeedSetDeviceCeedIntArray_Cuda(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed, @@ -542,7 +601,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned)); CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); impl->h_offsets = impl->h_offsets_owned; - if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets)); 
+ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets)); } break; } @@ -592,7 +651,12 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", + CeedElemRestrictionGetAtPointsElementOffset_Cuda)); + } CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c index 2759b38a4c..b0489d36d6 100644 --- a/backends/cuda-ref/ceed-cuda-ref-vector.c +++ b/backends/cuda-ref/ceed-cuda-ref-vector.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -41,10 +41,8 @@ static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { CeedSize length; size_t bytes; - Ceed ceed; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device"); @@ -56,10 +54,10 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { } else if (impl->d_array_owned) { impl->d_array = impl->d_array_owned; } else { - CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes)); + CeedCallCuda(CeedVectorReturnCeed(vec), cudaMalloc((void **)&impl->d_array_owned, bytes)); impl->d_array = impl->d_array_owned; } - CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice)); + CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -68,13 +66,11 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { CeedSize length; - Ceed ceed; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host"); if (impl->h_array_borrowed) { impl->h_array = impl->h_array_borrowed; @@ -91,7 +87,7 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); - CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, 
bytes, cudaMemcpyDeviceToHost)); + CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -111,7 +107,9 @@ static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) case CEED_MEM_DEVICE: return CeedVectorSyncH2D_Cuda(vec); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -202,6 +200,7 @@ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMod CeedCallBackend(CeedSetDeviceCeedScalarArray_Cuda(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned, (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -220,7 +219,73 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t case CEED_MEM_DEVICE: return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP +} + +//------------------------------------------------------------------------------ +// Copy host array to value strided +//------------------------------------------------------------------------------ +static int CeedHostCopyStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *h_copy_array) { + for (CeedSize i = start; i < stop; i += step) h_copy_array[i] = h_array[i]; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Copy device array to value strided (impl in .cu file) +//------------------------------------------------------------------------------ +int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array); + 
+//------------------------------------------------------------------------------ +// Copy a vector to a value strided +//------------------------------------------------------------------------------ +static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) { + CeedSize length; + CeedVector_Cuda *impl; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + { + CeedSize length_vec, length_copy; + + CeedCallBackend(CeedVectorGetLength(vec, &length_vec)); + CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy)); + length = length_vec < length_copy ? length_vec : length_copy; + } + if (stop == -1) stop = length; + // Set value for synced device/host array + if (impl->d_array) { + CeedScalar *copy_array; + + CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array)); +#if (CUDA_VERSION >= 12000) + cublasHandle_t handle; + Ceed ceed; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle)); +#if defined(CEED_SCALAR_IS_FP32) + CeedCallCublas(ceed, cublasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step)); +#else /* CEED_SCALAR */ + CeedCallCublas(ceed, cublasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step)); +#endif /* CEED_SCALAR */ + CeedCallBackend(CeedDestroy(&ceed)); +#else /* CUDA_VERSION */ + CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, stop, step, copy_array)); +#endif /* CUDA_VERSION */ + CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array)); + impl->h_array = NULL; + } else if (impl->h_array) { + CeedScalar *copy_array; + + CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array)); + CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, stop, step, copy_array)); + CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array)); + impl->d_array = NULL; + } else 
{ + return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set"); + } + return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ @@ -260,16 +325,55 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { } } if (impl->d_array) { - CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val)); + if (val == 0) { + CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemset(impl->d_array, 0, length * sizeof(CeedScalar))); + } else { + CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val)); + } impl->h_array = NULL; - } - if (impl->h_array) { + } else if (impl->h_array) { CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val)); impl->d_array = NULL; } return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Set host array to value strided +//------------------------------------------------------------------------------ +static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + for (CeedSize i = start; i < stop; i += step) h_array[i] = val; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Set device array to value strided (impl in .cu file) +//------------------------------------------------------------------------------ +int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val); + +//------------------------------------------------------------------------------ +// Set a vector to a value strided +//------------------------------------------------------------------------------ +static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + CeedSize length; + CeedVector_Cuda *impl; + + CeedCallBackend(CeedVectorGetData(vec, 
&impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + // Set value for synced device/host array + if (stop == -1) stop = length; + if (impl->d_array) { + CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, stop, step, val)); + impl->h_array = NULL; + } else if (impl->h_array) { + CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, stop, step, val)); + impl->d_array = NULL; + } else { + return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set"); + } + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Vector Take Array //------------------------------------------------------------------------------ @@ -377,9 +481,9 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) { Ceed ceed; CeedSize length; -#if CUDA_VERSION < 12000 +#if (CUDA_VERSION < 12000) CeedSize num_calls; -#endif +#endif /* CUDA_VERSION */ const CeedScalar *d_array; CeedVector_Cuda *impl; cublasHandle_t handle; @@ -389,146 +493,147 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle)); -#if CUDA_VERSION < 12000 +#if (CUDA_VERSION < 12000) // With CUDA 12, we can use the 64-bit integer interface. Prior to that, // we need to check if the vector is too long to handle with int32, // and if so, divide it into subsections for repeated cuBLAS calls. 
num_calls = length / INT_MAX; if (length % INT_MAX > 0) num_calls += 1; -#endif +#endif /* CUDA_VERSION */ // Compute norm CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); switch (type) { case CEED_NORM_1: { *norm = 0.0; - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { -#if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers - CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); -#else - float sub_norm = 0.0; - float *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); - *norm += sub_norm; - } -#endif - } else { -#if CUDA_VERSION >= 12000 - CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); -#else - double sub_norm = 0.0; - double *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - - CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); - *norm += sub_norm; - } -#endif +#if defined(CEED_SCALAR_IS_FP32) +#if (CUDA_VERSION >= 12000) // We have CUDA 12, and can use 64-bit integers + CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); +#else /* CUDA_VERSION */ + float sub_norm = 0.0; + float *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + + CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); + *norm += sub_norm; } +#endif /* CUDA_VERSION */ +#else /* CEED_SCALAR */ +#if (CUDA_VERSION >= 12000) + CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); +#else /* CUDA_VERSION */ + double sub_norm = 0.0; + double *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); + *norm += sub_norm; + } +#endif /* CUDA_VERSION */ +#endif /* CEED_SCALAR */ break; } case CEED_NORM_2: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { -#if CUDA_VERSION >= 12000 - CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); -#else - float sub_norm = 0.0, norm_sum = 0.0; - float *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); - norm_sum += sub_norm * sub_norm; - } - *norm = sqrt(norm_sum); -#endif - } else { -#if CUDA_VERSION >= 12000 - CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); -#else - double sub_norm = 0.0, norm_sum = 0.0; - double *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - - CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); - norm_sum += sub_norm * sub_norm; - } - *norm = sqrt(norm_sum); -#endif +#if defined(CEED_SCALAR_IS_FP32) +#if (CUDA_VERSION >= 12000) + CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); +#else /* CUDA_VERSION */ + float sub_norm = 0.0, norm_sum = 0.0; + float *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + + CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); + norm_sum += sub_norm * sub_norm; + } + *norm = sqrt(norm_sum); +#endif /* CUDA_VERSION */ +#else /* CEED_SCALAR */ +#if (CUDA_VERSION >= 12000) + CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); +#else /* CUDA_VERSION */ + double sub_norm = 0.0, norm_sum = 0.0; + double *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); + norm_sum += sub_norm * sub_norm; } + *norm = sqrt(norm_sum); +#endif /* CUDA_VERSION */ +#endif /* CEED_SCALAR */ break; } case CEED_NORM_MAX: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { -#if CUDA_VERSION >= 12000 - int64_t index; - CeedScalar norm_no_abs; - - CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index)); - CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); - *norm = fabs(norm_no_abs); -#else - CeedInt index; - float sub_max = 0.0, current_max = 0.0; - float *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); - CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); - if (fabs(sub_max) > current_max) current_max = fabs(sub_max); - } - *norm = current_max; -#endif - } else { -#if CUDA_VERSION >= 12000 - int64_t index; - CeedScalar norm_no_abs; - - CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index)); - CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); - *norm = fabs(norm_no_abs); -#else - CeedInt index; - double sub_max = 0.0, current_max = 0.0; - double *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - - CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); - CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); - if (fabs(sub_max) > current_max) current_max = fabs(sub_max); - } - *norm = current_max; -#endif +#if defined(CEED_SCALAR_IS_FP32) +#if (CUDA_VERSION >= 12000) + int64_t index; + CeedScalar norm_no_abs; + + CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + *norm = fabs(norm_no_abs); +#else /* CUDA_VERSION */ + CeedInt index; + float sub_max = 0.0, current_max = 0.0; + float *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } + *norm = current_max; +#endif /* CUDA_VERSION */ +#else /* CEED_SCALAR */ +#if (CUDA_VERSION >= 12000) + int64_t index; + CeedScalar norm_no_abs; + + CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + *norm = fabs(norm_no_abs); +#else /* CUDA_VERSION */ + CeedInt index; + double sub_max = 0.0, current_max = 0.0; + double *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + if (fabs(sub_max) > current_max) current_max = fabs(sub_max); + } + *norm = current_max; +#endif /* CUDA_VERSION */ +#endif /* CEED_SCALAR */ break; } } CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -580,13 +685,29 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) //------------------------------------------------------------------------------ static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) { CeedSize length; - CeedVector_Cuda *x_impl; + CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(x, &x_impl)); + CeedCallBackend(CeedVectorGetData(x, &impl)); CeedCallBackend(CeedVectorGetLength(x, &length)); // Set value for synced device/host array - if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length)); - if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length)); + if (impl->d_array) { +#if (CUDA_VERSION >= 12000) + cublasHandle_t handle; + + CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(x), &handle)); +#if defined(CEED_SCALAR_IS_FP32) + CeedCallCublas(CeedVectorReturnCeed(x), cublasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1)); +#else /* CEED_SCALAR */ + CeedCallCublas(CeedVectorReturnCeed(x), cublasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1)); +#endif /* CEED_SCALAR */ +#else /* CUDA_VERSION */ + CeedCallBackend(CeedDeviceScale_Cuda(impl->d_array, alpha, length)); +#endif /* CUDA_VERSION */ + impl->h_array = NULL; + } else if (impl->h_array) { + CeedCallBackend(CeedHostScale_Cuda(impl->h_array, alpha, length)); + impl->d_array = NULL; + } return 
CEED_ERROR_SUCCESS; } @@ -607,22 +728,32 @@ int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_arr // Compute y = alpha x + y //------------------------------------------------------------------------------ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) { - Ceed ceed; CeedSize length; CeedVector_Cuda *y_impl, *x_impl; - CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetLength(y, &length)); // Set value for synced device/host array if (y_impl->d_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); +#if (CUDA_VERSION >= 12000) + cublasHandle_t handle; + + CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(y), &handle)); +#if defined(CEED_SCALAR_IS_FP32) + CeedCallCublas(CeedVectorReturnCeed(y), cublasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1)); +#else /* CEED_SCALAR */ + CeedCallCublas(CeedVectorReturnCeed(y), cublasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1)); +#endif /* CEED_SCALAR */ +#else /* CUDA_VERSION */ CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length)); - } - if (y_impl->h_array) { +#endif /* CUDA_VERSION */ + y_impl->h_array = NULL; + } else if (y_impl->h_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length)); + y_impl->d_array = NULL; } return CEED_ERROR_SUCCESS; } @@ -728,18 +859,21 @@ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", 
CeedVectorTakeArray_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", CeedVectorCopyStrided_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedVectorSetData(vec, 
impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c index bbfa8cf875..0937b0ce17 100644 --- a/backends/cuda-ref/ceed-cuda-ref.c +++ b/backends/cuda-ref/ceed-cuda-ref.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -57,9 +57,11 @@ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateAtPoints", CeedElemRestrictionCreate_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreateAtPoints_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 349aa8ef3a..337e7c92a0 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED 
contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -48,6 +48,18 @@ typedef struct { const CeedInt8 *d_curl_orients; const CeedInt8 *d_curl_orients_borrowed; const CeedInt8 *d_curl_orients_owned; + const CeedInt *h_offsets_at_points; + const CeedInt *h_offsets_at_points_borrowed; + const CeedInt *h_offsets_at_points_owned; + const CeedInt *d_offsets_at_points; + const CeedInt *d_offsets_at_points_borrowed; + const CeedInt *d_offsets_at_points_owned; + const CeedInt *h_points_per_elem; + const CeedInt *h_points_per_elem_borrowed; + const CeedInt *h_points_per_elem_owned; + const CeedInt *d_points_per_elem; + const CeedInt *d_points_per_elem_borrowed; + const CeedInt *d_points_per_elem_owned; } CeedElemRestriction_Cuda; typedef struct { @@ -55,9 +67,19 @@ typedef struct { CUfunction Interp; CUfunction Grad; CUfunction Weight; + CUmodule moduleAtPoints; + CeedInt num_points; + CUfunction InterpAtPoints; + CUfunction InterpTransposeAtPoints; + CUfunction GradAtPoints; + CUfunction GradTransposeAtPoints; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; + CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Cuda; typedef struct { @@ -77,7 +99,6 @@ typedef struct { typedef struct { CUmodule module; const char *qfunction_name; - const char *qfunction_source; CUfunction QFunction; Fields_Cuda fields; void *d_c; @@ -111,12 +132,17 @@ typedef struct { } CeedOperatorAssemble_Cuda; typedef struct { - CeedVector *e_vecs; // E-vectors, inputs followed by outputs - CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator - CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; + uint64_t *input_states, 
points_state; // State tracking for passive inputs + CeedVector *e_vecs_in, *e_vecs_out; + CeedVector *q_vecs_in, *q_vecs_out; CeedInt num_inputs, num_outputs; CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in; + CeedInt *input_field_order, *output_field_order; + CeedSize max_active_e_vec_len; + CeedInt max_num_points; + CeedInt *num_points; + CeedVector *qf_active_in, point_coords_elem; CeedOperatorDiag_Cuda *diag; CeedOperatorAssemble_Cuda *asmb; } CeedOperator_Cuda; @@ -142,3 +168,4 @@ CEED_INTERN int CeedQFunctionCreate_Cuda(CeedQFunction qf); CEED_INTERN int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx); CEED_INTERN int CeedOperatorCreate_Cuda(CeedOperator op); +CEED_INTERN int CeedOperatorCreateAtPoints_Cuda(CeedOperator op); diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu index 51c5565308..cae3bad181 100644 --- a/backends/cuda-ref/kernels/cuda-ref-vector.cu +++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,159 +8,188 @@ #include #include +//------------------------------------------------------------------------------ +// Kernel for copy strided on device +//------------------------------------------------------------------------------ +__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *__restrict__ vec_copy) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < stop - start) { + if (index % step == 0) vec_copy[start + index] = vec[start + index]; + } +} + +//------------------------------------------------------------------------------ +// Copy strided on device memory +//------------------------------------------------------------------------------ +extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array) { + const int block_size = 512; + const CeedSize copy_size = stop - start; + int grid_size = copy_size / block_size; + + if (block_size * grid_size < copy_size) grid_size += 1; + copyStridedK<<>>(d_array, start, stop, step, d_copy_array); + return 0; +} + //------------------------------------------------------------------------------ // Kernel for set value on device //------------------------------------------------------------------------------ -__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size, - CeedScalar val) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) - return; - vec[index] = val; +__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) vec[index] = val; } //------------------------------------------------------------------------------ // Set value on device memory 
//------------------------------------------------------------------------------ -extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length, - CeedScalar val) { - const int block_size = 512; - const CeedSize vec_size = length; - int grid_size = vec_size / block_size; +extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) { + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; + + if (block_size * grid_size < vec_size) grid_size += 1; + setValueK<<>>(d_array, length, val); + return 0; +} + +//------------------------------------------------------------------------------ +// Kernel for set value strided on device +//------------------------------------------------------------------------------ +__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < stop - start) { + if (index % step == 0) vec[start + index] = val; + } +} + +//------------------------------------------------------------------------------ +// Set value strided on device memory +//------------------------------------------------------------------------------ +extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + const int block_size = 512; + const CeedSize set_size = stop - start; + int grid_size = set_size / block_size; - if (block_size * grid_size < vec_size) - grid_size += 1; - setValueK<<>>(d_array, length, val); + if (block_size * grid_size < set_size) grid_size += 1; + setValueStridedK<<>>(d_array, start, stop, step, val); return 0; } //------------------------------------------------------------------------------ // Kernel for taking reciprocal //------------------------------------------------------------------------------ -__global__ static 
void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) - return; - if (fabs(vec[index]) > 1E-16) - vec[index] = 1./vec[index]; +__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) { + if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index]; + } } //------------------------------------------------------------------------------ // Take vector reciprocal in device memory //------------------------------------------------------------------------------ -extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) { - const int block_size = 512; - const CeedSize vec_size = length; - int grid_size = vec_size / block_size; +extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) { + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (block_size * grid_size < vec_size) - grid_size += 1; - rcpValueK<<>>(d_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + rcpValueK<<>>(d_array, length); return 0; } //------------------------------------------------------------------------------ // Kernel for scale //------------------------------------------------------------------------------ -__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, - CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) - return; - x[index] *= alpha; +__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) x[index] *= alpha; } //------------------------------------------------------------------------------ // Compute x = alpha x on device 
//------------------------------------------------------------------------------ -extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, - CeedSize length) { - const int block_size = 512; - const CeedSize vec_size = length; - int grid_size = vec_size / block_size; +extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (block_size * grid_size < vec_size) - grid_size += 1; - scaleValueK<<>>(x_array, alpha, length); + if (block_size * grid_size < vec_size) grid_size += 1; + scaleValueK<<>>(x_array, alpha, length); return 0; } //------------------------------------------------------------------------------ // Kernel for axpy //------------------------------------------------------------------------------ -__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, - CeedScalar * __restrict__ x, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) - return; - y[index] += alpha * x[index]; +__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ -extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, - CeedScalar *x_array, CeedSize length) { - const int block_size = 512; - const CeedSize vec_size = length; - int grid_size = vec_size / block_size; +extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { + const int block_size = 512; + const CeedSize vec_size = 
length; + int grid_size = vec_size / block_size; - if (block_size * grid_size < vec_size) - grid_size += 1; - axpyValueK<<>>(y_array, alpha, x_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + axpyValueK<<>>(y_array, alpha, x_array, length); return 0; } //------------------------------------------------------------------------------ // Kernel for axpby //------------------------------------------------------------------------------ -__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta, - CeedScalar * __restrict__ x, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) - return; - y[index] = beta * y[index]; - y[index] += alpha * x[index]; +__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) { + y[index] = beta * y[index]; + y[index] += alpha * x[index]; + } } //------------------------------------------------------------------------------ // Compute y = alpha x + beta y on device //------------------------------------------------------------------------------ -extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, - CeedScalar *x_array, CeedSize length) { - const int block_size = 512; - const CeedSize vec_size = length; - int grid_size = vec_size / block_size; +extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (block_size * grid_size < vec_size) - grid_size += 1; - axpbyValueK<<>>(y_array, alpha, beta, x_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + axpbyValueK<<>>(y_array, alpha, beta, x_array, length); 
return 0; } //------------------------------------------------------------------------------ // Kernel for pointwise mult //------------------------------------------------------------------------------ -__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, - CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) - return; - w[index] = x[index] * y[index]; +__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) w[index] = x[index] * y[index]; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device //------------------------------------------------------------------------------ -extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, - CeedScalar *y_array, CeedSize length) { - const int block_size = 512; - const CeedSize vec_size = length; - int grid_size = vec_size / block_size; +extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (block_size * grid_size < vec_size) - grid_size += 1; - pointwiseMultValueK<<>>(w_array, x_array, y_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + pointwiseMultValueK<<>>(w_array, x_array, y_array, length); return 0; } diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index c22bce82da..885e5f0979 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National 
Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -12,23 +12,17 @@ #include #include #include +#include #include "../cuda/ceed-cuda-common.h" #include "../cuda/ceed-cuda-compile.h" #include "ceed-cuda-shared.h" //------------------------------------------------------------------------------ -// Device initalization +// Apply tensor basis //------------------------------------------------------------------------------ -int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B); -int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); -int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); - -//------------------------------------------------------------------------------ -// Apply basis -//------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Cuda *ceed_Cuda; CeedInt dim, num_comp; @@ -45,102 +39,113 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, 
CEED_MEM_DEVICE, &d_v)); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } // Apply basis operation switch (eval_mode) { case CEED_EVAL_INTERP: { CeedInt P_1d, Q_1d; + CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B)); - void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; + void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; if (dim == 1) { - CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, - 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } } else if (dim == 2) { const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; // elems_per_block must be at least 1 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + interp_args)); } } else if (dim == 3) { CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + interp_args)); } } } break; case CEED_EVAL_GRAD: { CeedInt P_1d, Q_1d; + CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); - CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + CeedScalar *d_grad_1d = data->d_grad_1d; if (data->d_collo_grad_1d) { - CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); - } else { - CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); + d_grad_1d = data->d_collo_grad_1d; } - void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v}; + void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; + if (dim == 1) { - CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, - 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * 
elems_per_block < num_elem) ? 1 : 0); + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } } else if (dim == 2) { const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; // elems_per_block must be at least 1 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, grad_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } } else if (dim == 3) { CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, grad_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } } } break; @@ -148,23 +153,24 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt Q_1d; CeedInt block_size = 32; + CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; if (dim == 1) { const CeedInt elems_per_block = block_size / Q_1d; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); } else if (dim == 2) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } else if (dim == 3) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } @@ -182,6 +188,380 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis apply - tensor AtPoints +//------------------------------------------------------------------------------ +static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + Ceed ceed; + Ceed_Cuda *ceed_Cuda; + CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; + const CeedScalar *d_x, *d_u; + CeedScalar *d_v; + CeedBasis_Cuda_shared *data; + + CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + + // Weight handled separately + if (eval_mode == CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorSetValue(v, 1.0)); + return CEED_ERROR_SUCCESS; + } + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); + + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt q_comp; + CeedSize len, len_required; + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? 
u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." + " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); + } + } + + // Build kernels if needed + if (data->num_points != max_num_points) { + CeedInt P_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + data->num_points = max_num_points; + + // -- Create interp matrix to Chebyshev coefficients + if (!data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; + + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + + // -- Compile kernels + const char basis_kernel_source[] = "// AtPoints basis 
source\n#include \n"; + CeedInt num_comp; + + if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", + CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), + "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints)); + } + + // Get read/write access to u, v + CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); + if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } + + // Basis action + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedInt P_1d, Q_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = 
CeedIntMax(Q_1d, P_1d); + + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, + thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, + interp_args)); + } + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + interp_args)); + } + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + interp_args)); + } + } + } break; + case CEED_EVAL_GRAD: { + CeedInt P_1d, Q_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, + thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + grad_args)); + } + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + grad_args)); + } + } + } break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_NONE: /* handled separately below */ + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + + // Restore vectors, cover CEED_EVAL_NONE + CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); + if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + 
return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Apply non-tensor basis +//------------------------------------------------------------------------------ +static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + Ceed ceed; + Ceed_Cuda *ceed_Cuda; + CeedInt dim; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Cuda_shared *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); + CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + + // Get read/write access to u, v + if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } + + // Apply basis operation + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedInt P, Q; + + CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + + void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, 
&d_v}; + + { + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); + + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, + elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args)); + } + } + } break; + case CEED_EVAL_GRAD: { + CeedInt P, Q; + + CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + + void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; + + { + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); + + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, + elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args)); + } + } + } break; + case CEED_EVAL_WEIGHT: { + CeedInt P, Q; + + CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + + { + // avoid >512 total threads + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + + CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args)); + } + } break; + case CEED_EVAL_NONE: /* handled separately below */ + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + + // Restore vectors, cover CEED_EVAL_NONE + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); + if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int 
CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ -195,11 +575,16 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallCuda(ceed, cuModuleUnload(data->module)); - CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); + if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); + CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -209,8 +594,6 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp; const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); const CeedInt interp_bytes = q_bytes * P_1d; @@ -220,8 +603,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); - CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 
cudaMemcpyHostToDevice)); + if (q_weight_1d) { + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); + } CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); @@ -242,27 +627,103 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, } // Compile basis kernels + bool is_collocated = false; + const char basis_kernel_source[] = "// Tensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n"); - CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", + CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); + CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? 
"InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", + &data->InterpTransposeAdd)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); + + CeedCallBackend(CeedBasisSetData(basis, data)); + + // Register backend functions + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create non-tensor basis +//------------------------------------------------------------------------------ +int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { + Ceed ceed; + CeedInt num_comp, q_comp_interp, q_comp_grad; + const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + 
CeedBasis_Cuda_shared *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + // Check shared memory size + { + Ceed_Cuda *cuda_data; + + CeedCallBackend(CeedGetData(ceed, &cuda_data)); + if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > + cuda_data->device_prop.sharedMemPerBlock) { + CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; + } + } + + CeedCallBackend(CeedCalloc(1, &data)); + + // Copy basis data to GPU + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); + if (q_weight) { + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); + } + if (interp) { + const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; + + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); + } + if (grad) { + const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; + + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); + } + + // Compile basis kernels + const char basis_kernel_source[] = "// Non-tensor basis source\n#include \n"; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D", + CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", 
&data->Interp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions - CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c index ef704f7193..1224032995 100644 --- a/backends/cuda-shared/ceed-cuda-shared.c +++ b/backends/cuda-shared/ceed-cuda-shared.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -24,6 +24,7 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) { CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/cuda/shared"), ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); + CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); CeedCallBackend(CeedCalloc(1, &data)); @@ -32,8 +33,10 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) { CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index ffc70dd6f5..7d67327789 100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -14,16 +14,31 @@ typedef struct { CUmodule module; CUfunction Interp; CUfunction InterpTranspose; + CUfunction InterpTransposeAdd; CUfunction Grad; CUfunction GradTranspose; + CUfunction GradTransposeAdd; CUfunction Weight; + CUmodule moduleAtPoints; + CeedInt num_points; + CUfunction InterpAtPoints; + CUfunction InterpTransposeAtPoints; + CUfunction InterpTransposeAddAtPoints; + CUfunction GradAtPoints; + CUfunction GradTransposeAtPoints; + CUfunction GradTransposeAddAtPoints; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_collo_grad_1d; CeedScalar *d_q_weight_1d; - CeedScalar *c_B; - CeedScalar *c_G; + CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Cuda_shared; CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); + +CEED_INTERN int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu deleted file mode 100644 index 3374cd8bb8..0000000000 --- a/backends/cuda-shared/kernels/cuda-shared-basis.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include -#include - -const int sizeMax = 16; -__constant__ CeedScalar c_B[sizeMax*sizeMax]; -__constant__ CeedScalar c_G[sizeMax*sizeMax]; - -//------------------------------------------------------------------------------ -// Interp device initalization -//------------------------------------------------------------------------------ -extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, - CeedScalar **c_B_ptr) { - const int bytes = P_1d*Q_1d*sizeof(CeedScalar); - - cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice); - cudaGetSymbolAddress((void **)c_B_ptr, c_B); - return CEED_ERROR_SUCCESS; -} - -//------------------------------------------------------------------------------ -// Grad device initalization -//------------------------------------------------------------------------------ -extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, - CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) { - const int bytes = P_1d*Q_1d*sizeof(CeedScalar); - - cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice); - cudaGetSymbolAddress((void **)c_B_ptr, c_B); - cudaMemcpyToSymbol(c_G, d_G, bytes, 0, cudaMemcpyDeviceToDevice); - cudaGetSymbolAddress((void **)c_G_ptr, c_G); - return CEED_ERROR_SUCCESS; -} - -//------------------------------------------------------------------------------ -// Collocated grad device initalization -//------------------------------------------------------------------------------ -extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, - CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) { - const int bytes_interp = P_1d*Q_1d*sizeof(CeedScalar); - const int bytes_grad = Q_1d*Q_1d*sizeof(CeedScalar); - - cudaMemcpyToSymbol(c_B, d_B, bytes_interp, 0, cudaMemcpyDeviceToDevice); - cudaGetSymbolAddress((void **)c_B_ptr, 
c_B); - cudaMemcpyToSymbol(c_G, d_G, bytes_grad, 0, cudaMemcpyDeviceToDevice); - cudaGetSymbolAddress((void **)c_G_ptr, c_G); - return CEED_ERROR_SUCCESS; -} - -//------------------------------------------------------------------------------ diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c index cae17d11d5..9538a2ee4d 100644 --- a/backends/cuda/ceed-cuda-common.c +++ b/backends/cuda/ceed-cuda-common.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -53,10 +53,15 @@ static inline int CeedSetDeviceGenericArray_Cuda(Ceed ceed, const void *source_a void *target_array_owned, void *target_array_borrowed, void *target_array) { switch (copy_mode) { case CEED_COPY_VALUES: - if (!*(void **)target_array_owned) CeedCallCuda(ceed, cudaMalloc(target_array_owned, size_unit * num_values)); - if (source_array) CeedCallCuda(ceed, cudaMemcpy(*(void **)target_array_owned, source_array, size_unit * num_values, cudaMemcpyDeviceToDevice)); - *(void **)target_array_borrowed = NULL; - *(void **)target_array = *(void **)target_array_owned; + if (!*(void **)target_array) { + if (*(void **)target_array_borrowed) { + *(void **)target_array = *(void **)target_array_borrowed; + } else { + if (!*(void **)target_array_owned) CeedCallCuda(ceed, cudaMalloc(target_array_owned, size_unit * num_values)); + *(void **)target_array = *(void **)target_array_owned; + } + } + if (source_array) CeedCallCuda(ceed, cudaMemcpy(*(void **)target_array, source_array, size_unit * num_values, cudaMemcpyDeviceToDevice)); break; case CEED_OWN_POINTER: CeedCallCuda(ceed, cudaFree(*(void **)target_array_owned)); diff --git a/backends/cuda/ceed-cuda-common.h b/backends/cuda/ceed-cuda-common.h index 
1fc8362717..489374a29a 100644 --- a/backends/cuda/ceed-cuda-common.h +++ b/backends/cuda/ceed-cuda-common.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -66,6 +66,8 @@ static const char *cublasGetErrorName(cublasStatus_t error) { typedef struct { int device_id; + bool use_llvm_version; + int llvm_version; cublasHandle_t cublas_handle; struct cudaDeviceProp device_prop; } Ceed_Cuda; diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index b8186ba2d1..d1593dd800 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -11,11 +11,19 @@ #include #include #include +#include #include #include +#include #include +#include +#include +#include +#include +#include #include +#include #include "ceed-cuda-common.h" @@ -31,15 +39,42 @@ CeedChk_Nvrtc(ceed, ierr_q_); \ } while (0) +#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message)) + +//------------------------------------------------------------------------------ +// Call system command and capture stdout + stderr +//------------------------------------------------------------------------------ +static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) { + CeedDebug(ceed, "Running command:\n$ %s", command); + FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r"); + + CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command); + + char line[CEED_MAX_RESOURCE_LEN] = ""; + std::string output = ""; + + while (fgets(line, sizeof(line), output_stream) != nullptr) { + output += line; + } + CeedDebug(ceed, "output:\n%s\n", output.c_str()); + CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str()); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Compile CUDA kernel //------------------------------------------------------------------------------ -int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) 
{ +using std::ifstream; +using std::ofstream; +using std::ostringstream; + +static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module, + const CeedInt num_defines, va_list args) { size_t ptx_size; char *ptx; - const char *jit_defs_path, *jit_defs_source; - const int num_opts = 3; - const char *opts[num_opts]; + const int num_opts = 4; + CeedInt num_jit_source_dirs = 0, num_jit_defines = 0; + const char **opts; nvrtcProgram prog; struct cudaDeviceProp prop; Ceed_Cuda *ceed_data; @@ -47,11 +82,17 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed cudaFree(0); // Make sure a Context exists for nvrtc std::ostringstream code; + bool using_clang; + + CeedCallBackend(CeedGetIsClang(ceed, &using_clang)); + + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, + using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)" + : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment " + "variable GPU_CLANG=1"); // Get kernel specific options, such as kernel constants if (num_defines > 0) { - va_list args; - va_start(args, num_defines); char *name; int val; @@ -60,59 +101,329 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } - va_end(args); } // Standard libCEED definitions for CUDA backends - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-jit.h", &jit_defs_path)); - { - char *source; - - CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &source)); - jit_defs_source = source; - } - code << jit_defs_source; - code << "\n\n"; - CeedCallBackend(CeedFree(&jit_defs_path)); - CeedCallBackend(CeedFree(&jit_defs_source)); + code << "#include \n\n"; // Non-macro options + CeedCallBackend(CeedCalloc(num_opts, &opts)); opts[0] = "-default-device"; 
CeedCallBackend(CeedGetData(ceed, &ceed_data)); CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id)); - std::string arch_arg = "-arch=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); - opts[1] = arch_arg.c_str(); - opts[2] = "-Dint32_t=int"; + std::string arch_arg = +#if CUDA_VERSION >= 11010 + // NVRTC used to support only virtual architectures through the option + // -arch, since it was only emitting PTX. It will now support actual + // architectures as well to emit SASS. + // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation + "-arch=sm_" +#else + "-arch=compute_" +#endif + + std::to_string(prop.major) + std::to_string(prop.minor); + opts[1] = arch_arg.c_str(); + opts[2] = "-Dint32_t=int"; + opts[3] = "-DCEED_RUNNING_JIT_PASS=1"; + // Additional include dirs + { + const char **jit_source_dirs; + + CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs)); + CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts)); + for (CeedInt i = 0; i < num_jit_source_dirs; i++) { + std::ostringstream include_dir_arg; + + include_dir_arg << "-I" << jit_source_dirs[i]; + CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i])); + } + CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs)); + } + // User defines + { + const char **jit_defines; + + CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines)); + CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts)); + for (CeedInt i = 0; i < num_jit_defines; i++) { + std::ostringstream define_arg; + + define_arg << "-D" << jit_defines[i]; + CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i])); + } + CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines)); + } // Add string source argument provided in call code << source; - // Create Program - 
CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); - // Compile kernel - nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts); - - if (result != NVRTC_SUCCESS) { - char *log; - size_t log_size; - - CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n"); - CeedDebug(ceed, "Source:\n%s\n", code.str().c_str()); - CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n"); - CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); - CeedCallBackend(CeedMalloc(log_size, &log)); - CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n"); + CeedDebug(ceed, "Source:\n%s\n", code.str().c_str()); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n"); + + if (!using_clang) { + CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); + + if (CeedDebugFlag(ceed)) { + // LCOV_EXCL_START + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n"); + for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) { + CeedDebug(ceed, "Option %d: %s", i, opts[i]); + } + CeedDebug(ceed, ""); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n"); + // LCOV_EXCL_STOP + } + + nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts); + + for (CeedInt i = 0; i < num_jit_source_dirs; i++) { + CeedCallBackend(CeedFree(&opts[num_opts + i])); + } + for (CeedInt i = 0; i < num_jit_defines; i++) { + CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); + } + CeedCallBackend(CeedFree(&opts)); + *is_compile_good = result == NVRTC_SUCCESS; 
+ if (!*is_compile_good) { + char *log; + size_t log_size; + + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); + if (throw_error) { + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); + } else { + // LCOV_EXCL_START + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n"); + CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log); + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n"); + CeedCallBackend(CeedFree(&log)); + CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog)); + return CEED_ERROR_SUCCESS; + // LCOV_EXCL_STOP + } + } + +#if CUDA_VERSION >= 11010 + CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size)); + CeedCallBackend(CeedMalloc(ptx_size, &ptx)); + CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx)); +#else + CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size)); + CeedCallBackend(CeedMalloc(ptx_size, &ptx)); + CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx)); +#endif + CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog)); + + CeedCallCuda(ceed, cuModuleLoadData(module, ptx)); + CeedCallBackend(CeedFree(&ptx)); + return CEED_ERROR_SUCCESS; + } else { + srand(time(NULL)); + const int build_id = rand(); + + // Create temp dir if needed + { + DIR *dir = opendir("temp"); + + if (dir) { + closedir(dir); + } else { + // In parallel multiple processes may attempt + // Only one process needs to succeed + mkdir("temp", 0777); + chmod("temp", 0777); + } + } + // Write code to temp file + { + std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu"); + FILE *file = fopen(filename.c_str(), "w"); + + CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. 
Write access is required for cuda-clang"); + fputs(code.str().c_str(), file); + fclose(file); + } + + // Get rust crate directories + const char **rust_source_dirs = nullptr; + int num_rust_source_dirs = 0; + + CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs)); + + std::string rust_dirs[10]; + + if (num_rust_source_dirs > 0) { + CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]); + } + + for (CeedInt i = 0; i < num_rust_source_dirs; i++) { + rust_dirs[i] = std::string(rust_source_dirs[i]); + } + + CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs)); + + char *rust_toolchain = std::getenv("RUST_TOOLCHAIN"); + + if (rust_toolchain == nullptr) { + rust_toolchain = (char *)"nightly"; + setenv("RUST_TOOLCHAIN", "nightly", 0); + } + + // Compile Rust crate(s) needed + std::string command; + + for (CeedInt i = 0; i < num_rust_source_dirs; i++) { + command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] + + "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml"; + CeedCallSystem(ceed, command.c_str(), "build Rust crate"); + } + + // Get Clang version + bool use_llvm_version = ceed_data->use_llvm_version; + int llvm_version = ceed_data->llvm_version; + + if (llvm_version == 0) { + command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version"; + CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str()); + FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r"); + + CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version"); + + char line[CEED_MAX_RESOURCE_LEN] = ""; + std::string output = ""; + + while (fgets(line, sizeof(line), output_stream) != nullptr) { + output += line; + } + CeedDebug(ceed, "output:\n%s", output.c_str()); + 
CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s", + command.c_str(), output.c_str()); + + const char *version_substring = strstr(output.c_str(), "LLVM version "); + + version_substring += 13; + + char *next_dot = strchr((char *)version_substring, '.'); + + if (next_dot) { + next_dot[0] = '\0'; + ceed_data->llvm_version = llvm_version = std::stoi(version_substring); + CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version); + + command = std::string("clang++-") + std::to_string(llvm_version); + output_stream = popen((command + std::string(" 2>&1")).c_str(), "r"); + ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0; + } else { + ceed_data->llvm_version = -1; + ceed_data->use_llvm_version = use_llvm_version = false; + } + } + + // Compile wrapper kernel + command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" + + std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) + + "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll "; + command += opts[4]; + CeedCallSystem(ceed, command.c_str(), "JiT kernel source"); + CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions"); + + // Find Rust's llvm-link tool and run it + command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" + + std::to_string(build_id) + + "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o " + "temp/kernel_" + + std::to_string(build_id) + "_2_linked.ll "; + + // Searches for .a files in Rust directory + // Note: Rust crate names may not match the folder they are in + // TODO: If libCEED switches to c++17, use std::filesystem here + for (CeedInt i = 0; i < 
num_rust_source_dirs; i++) { + std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release"; + DIR *dp = opendir(dir.c_str()); + + CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str()); + struct dirent *entry; + + // Find files ending in .a + while ((entry = readdir(dp)) != nullptr) { + std::string filename(entry->d_name); + + if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") { + command += dir + "/" + filename + " "; + } + } + closedir(dp); + } + + // Link, optimize, and compile final CUDA kernel + CeedCallSystem(ceed, command.c_str(), "link C and Rust source"); + CeedCallSystem(ceed, + ("$(find $(rustup run " + std::string(rust_toolchain) + + " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" + std::to_string(build_id) + + "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc") + .c_str(), + "optimize linked C and Rust source"); + CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions"); + CeedCallSystem(ceed, + ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" + + std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) + + "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx") + .c_str(), + "compile final CUDA kernel"); + CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions"); + + // Load module from final PTX + ifstream ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx"); + ostringstream sstr; + + sstr << ptxfile.rdbuf(); + + auto ptx_data = sstr.str(); + ptx_size = ptx_data.length(); + + int result = cuModuleLoadData(module, ptx_data.c_str()); + + *is_compile_good = result == 0; + if (!*is_compile_good) { + if (throw_error) { + return CeedError(ceed, 
CEED_ERROR_BACKEND, "Failed to load module data"); + } else { + // LCOV_EXCL_START + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n"); + CeedDebug(ceed, "Error: Failed to load module data"); + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n"); + return CEED_ERROR_SUCCESS; + // LCOV_EXCL_STOP + } + } } + return CEED_ERROR_SUCCESS; +} + +int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) { + bool is_compile_good = true; + va_list args; + + va_start(args, num_defines); + const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args); + + va_end(args); + CeedCallBackend(ierr); + return CEED_ERROR_SUCCESS; +} - CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size)); - CeedCallBackend(CeedMalloc(ptx_size, &ptx)); - CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx)); - CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog)); +int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) 
{ + va_list args; - CeedCallCuda(ceed, cuModuleLoadData(module, ptx)); - CeedCallBackend(CeedFree(&ptx)); + va_start(args, num_defines); + const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args); + + va_end(args); + CeedCallBackend(ierr); return CEED_ERROR_SUCCESS; } @@ -141,7 +452,7 @@ int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void // Run CUDA kernel //------------------------------------------------------------------------------ int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size, 1, 1, 0, args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args)); return CEED_ERROR_SUCCESS; } @@ -150,19 +461,20 @@ int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const //------------------------------------------------------------------------------ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, void **args) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, 0, args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Run CUDA kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ -int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, - const int block_size_z, const int shared_mem_size, void **args) { +static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, 
CUstream stream, const int grid_size, const int block_size_x, + const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error, + bool *is_good_run, void **args) { #if CUDA_VERSION >= 9000 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size); #endif - CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); + CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL); if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) { int max_threads_per_block, shared_size_bytes, num_regs; @@ -170,11 +482,37 @@ int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_siz cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", - max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + if (throw_error) { + return CeedError(ceed, CEED_ERROR_BACKEND, + "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", + max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + } else { + // LCOV_EXCL_START + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n"); + CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n", + max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + CeedDebug256(ceed, 
CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n"); + // LCOV_EXCL_STOP + } + *is_good_run = false; } else CeedChk_Cu(ceed, result); return CEED_ERROR_SUCCESS; } +int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args) { + bool is_good_run = true; + + CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, + &is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + +int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) { + CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, + is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h index 846de28c9d..151f0e0a24 100644 --- a/backends/cuda/ceed-cuda-compile.h +++ b/backends/cuda/ceed-cuda-compile.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -13,6 +13,7 @@ static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...); CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel); @@ -22,5 +23,7 @@ CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, void **args); -CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, - int shared_mem_size, void **args); +CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y, + int block_size_z, int shared_mem_size, void **args); +CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y, + int block_size_z, int shared_mem_size, bool *is_good_run, void **args); diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index c4878a5fed..d2261d6f1b 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -9,8 +9,10 @@ #include #include +#include #include +#include #include #include #include @@ -21,370 +23,494 @@ #include "../hip/ceed-hip-compile.h" #include "ceed-hip-gen.h" +struct FieldReuse_Hip { + CeedInt index; + bool is_input; + CeedEvalMode eval_mode; +}; + //------------------------------------------------------------------------------ // Calculate the block size used for launching the operator kernel //------------------------------------------------------------------------------ extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { - const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); + const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); if (dim == 1) { - CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - block_sizes[0] = thread1d; + block_sizes[0] = thread_1d; block_sizes[1] = 1; block_sizes[2] = elems_per_block; } else if (dim == 2) { - const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; + const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; + block_sizes[0] = thread_1d; + block_sizes[1] = thread_1d; block_sizes[2] = elems_per_block; } else if (dim == 3) { - const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); + const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 
2 : 1); - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; + block_sizes[0] = thread_1d; + block_sizes[1] = thread_1d; block_sizes[2] = elems_per_block; } return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Build single operator kernel +// Determine type of operator //------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { - using std::ostringstream; - using std::string; - - Ceed ceed; - bool is_setup_done, is_identity_qf; - CeedSize l_size; - CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; - CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedElemRestriction_Hip *rstr_data; - CeedBasis basis; - CeedBasis_Hip_shared *basis_data; - CeedQFunctionField *qf_input_fields, *qf_output_fields; - CeedQFunction_Hip_gen *qf_data; - CeedQFunction qf; - CeedOperatorField *op_input_fields, *op_output_fields; - CeedOperator_Hip_gen *data; - - CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); - if (is_setup_done) return CEED_ERROR_SUCCESS; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &data)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - Q_1d = Q; - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); +static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, + CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, + CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt 
*Q_1d, + CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { + // Check if all are tensor + *is_all_tensor = true; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; - // TODO: put in a function? - // Check for restriction only identity operator - CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); - if (is_identity_qf) { - CeedEvalMode eval_mode_in, eval_mode_out; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_field_tensor; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); - CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, - "Backend does not implement restriction only identity operators"); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + *is_all_tensor = *is_all_tensor && is_field_tensor; + } + CeedCallBackend(CeedBasisDestroy(&basis)); } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; - ostringstream code; - - // Load basis source files - // TODO: generalize to accept different device functions? 
- { - char *tensor_basis_kernel_source; - const char *tensor_basis_kernel_path; - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source)); - code << tensor_basis_kernel_source; - CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); - CeedCallBackend(CeedFree(&tensor_basis_kernel_source)); - } - { - char *hip_gen_template_source; - const char *hip_gen_template_path; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_field_tensor; - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Hip-Gen Template Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source)); - code << hip_gen_template_source; - CeedCallBackend(CeedFree(&hip_gen_template_path)); - CeedCallBackend(CeedFree(&hip_gen_template_source)); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + *is_all_tensor = *is_all_tensor && is_field_tensor; + } + CeedCallBackend(CeedBasisDestroy(&basis)); } - // Get QFunction source and name - string qfunction_source(qf_data->qfunction_source); - string qfunction_name(qf_data->qfunction_name); - string operator_name; - operator_name = "CeedKernelHipGenOperator_" + qfunction_name; + // Find max_P, max_P_1d, Q, and Q_1d + bool is_all_3d = true; - // Find dim, P_1d, Q_1d - data->max_P_1d = 0; + *max_P = 0; + *max_P_1d = 0; + *Q = 0; + *Q_1d = 0; for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { - bool is_tensor; + bool 
is_field_tensor; + CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Check if 3D + CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); + is_all_3d = is_all_3d && (field_dim == 3); + *max_dim = CeedIntMax(*max_dim, field_dim); - // Collect dim, P_1d, and Q_1d - CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); - CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; + // Collect P, P_1d, Q, and Q_1d + CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); + *max_P = CeedIntMax(*max_P, field_P); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); + *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); + } + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); + CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q = field_Q; + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); + CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q_1d = field_Q_1d; + } } + CeedCallBackend(CeedBasisDestroy(&basis)); } - // Check output bases for Q_1d, dim as well - // The only input basis might be CEED_BASIS_NONE for (CeedInt i = 0; i < num_output_fields; i++) { - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); if (basis != 
CEED_BASIS_NONE) { - bool is_tensor; + bool is_field_tensor; + CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Check if 3D + CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); + is_all_3d = is_all_3d && (field_dim == 3); + *max_dim = CeedIntMax(*max_dim, field_dim); - // Collect Q_1d - CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); - CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + // Collect P, P_1d, Q, and Q_1d + CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); + *max_P = CeedIntMax(*max_P, field_P); + CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); + *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); + } + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); + CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q = field_Q; + if (is_field_tensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); + CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); + *Q_1d = field_Q_1d; + } } + CeedCallBackend(CeedBasisDestroy(&basis)); } - data->dim = dim; - data->Q_1d = Q_1d; // Only use 3D collocated gradient parallelization strategy when gradient is computed - // TODO: put in a function? 
- bool use_collograd_parallelization = false; - - if (dim == 3) { + *use_3d_slices = false; + if (is_all_3d && *is_all_tensor) { bool was_grad_found = false; for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { + CeedBasis_Hip_shared *basis_data; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); - was_grad_found = true; + *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); + was_grad_found = true; + CeedCallBackend(CeedBasisDestroy(&basis)); } } for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { + CeedBasis_Hip_shared *basis_data; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); - was_grad_found = true; + *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? 
*use_3d_slices : true); + was_grad_found = true; + CeedCallBackend(CeedBasisDestroy(&basis)); } } } + return CEED_ERROR_SUCCESS; +} - // Define CEED_Q_VLA - code << "\n#undef CEED_Q_VLA\n"; - if (dim != 3 || use_collograd_parallelization) { - code << "#define CEED_Q_VLA 1\n\n"; - } else { - code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; - } +//------------------------------------------------------------------------------ +// Setup fields +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, + CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, + CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, + bool use_3d_slices, bool skip_active_load) { + bool is_tensor = true, is_active = true; + CeedBasis basis; - code << qfunction_source; + CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); + if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + { + CeedVector vec; - // Setup - code << "\n// -----------------------------------------------------------------------------\n"; - code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; - code << "__global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n"; - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << " const CeedScalar* d_u_" << i << " = fields.inputs[" << i << "];\n"; - } + CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); } - for (CeedInt i = 0; i < num_output_fields; i++) { - code << " CeedScalar* 
d_v_" << i << " = fields.outputs[" << i << "];\n"; - } + const char *field_name; + std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); + std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; + std::string option_name = (is_input ? "inputs" : "outputs"); + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedInt elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0; + CeedElemRestriction elem_rstr; + CeedBasis_Hip_shared *basis_data; - code << " const CeedInt dim = " << dim << ";\n"; - code << " const CeedInt Q_1d = " << Q_1d << ";\n"; + // Field reuse info + bool use_previous_field = field_reuse.index != -1; - code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; - // TODO put in a function? InitSharedData_Hip? - code << " SharedData_Hip data;\n"; - code << " data.t_id_x = threadIdx.x;\n"; - code << " data.t_id_y = threadIdx.y;\n"; - code << " data.t_id_z = threadIdx.z;\n"; - code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; - code << " data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; + CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name)); + code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n"; - code << "\n // -- Input field constants and basis data --\n"; - // TODO: Put in a function? 
- // Initialize constants, and matrices B and G - for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + // Get field data + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); + if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + if (basis != CEED_BASIS_NONE) { + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); + } + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); - // Set field constants - if (eval_mode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; + // Set field constants + code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; + if (is_tensor && !is_all_tensor) { + CeedInt P = 0; + + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n"; + } + code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? 
Q_1d : P_1d) << ";\n"; + if (eval_mode != CEED_EVAL_WEIGHT) { + code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; + } + + // Load basis data + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + break; + case CEED_EVAL_INTERP: + if (is_at_points) { + // AtPoints + if (!basis_data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; + + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallHip(CeedBasisReturnCeed(basis), + hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; + else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; } else { - code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; + // Standard quadrature + if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; + else data->B.outputs[i] = basis_data->d_interp_1d; } - code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; - } + if (use_previous_field && !skip_active_load) { + std::string reuse_var = "s_B" + ((field_reuse.is_input ? 
"_in_" : "_out_") + std::to_string(field_reuse.index)); - // Load basis data - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - break; - case CEED_EVAL_INTERP: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; - break; - case CEED_EVAL_GRAD: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; - if (use_collograd_parallelization) { - data->G.inputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; - code << " loadMatrix(data, G.inputs[" << i << "], s_G_in_" << i << ");\n"; + code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; + } else { + bool is_collocated = false; + + CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { + code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; } else { - bool has_collo_grad = basis_data->d_collo_grad_1d; - data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; - code << " loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i - << ");\n"; + code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." 
<< option_name << "[" << i << "], s_B" << var_suffix << ");\n"; } - break; - case CEED_EVAL_WEIGHT: - break; // No action - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - } - } - - code << "\n // -- Output field constants and basis data --\n"; - for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + } + break; + case CEED_EVAL_GRAD: + if (is_at_points) { + // AtPoints + if (!basis_data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; - // Set field constants - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; - } else { - code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; - } - code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallHip(CeedBasisReturnCeed(basis), + hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; + else data->B.outputs[i] = 
basis_data->d_chebyshev_interp_1d; + } else { + // Standard quadrature + if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; + else data->B.outputs[i] = basis_data->d_interp_1d; + } + if (is_tensor) { + if (use_previous_field && !skip_active_load) { + std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); - // Load basis data - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - break; // No action - case CEED_EVAL_INTERP: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; - break; - case CEED_EVAL_GRAD: - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; - code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; - if (use_collograd_parallelization) { - data->G.outputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; - code << " loadMatrix(data, G.outputs[" << i << "], s_G_out_" << i << ");\n"; + code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; } else { - bool has_collo_grad = basis_data->d_collo_grad_1d; - data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; - code << " loadMatrix<" << (has_collo_grad ? 
"Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_" - << i << ");\n"; + bool is_collocated = false; + + CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { + code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; + } } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); - break; // Should not occur + if (is_at_points) break; // No G mat for AtPoints + if (use_3d_slices) { + if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; + else data->G.outputs[i] = basis_data->d_collo_grad_1d; + if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { + std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); + + code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else if (is_active && skip_active_load) { + code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } + } else { + bool has_collo_grad = basis_data->d_collo_grad_1d; + + if (is_input) data->G.inputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + if (has_collo_grad) { + if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { + std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); + + code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else if (is_active && skip_active_load) { + code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; + code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } + } else { + if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { + std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); + + code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else if (is_active && skip_active_load) { + code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + } else { + code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") + << (is_tensor ? "" : var_suffix) << "];\n"; + code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G." 
+ << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } + } } - // LCOV_EXCL_STOP - } + break; + case CEED_EVAL_WEIGHT: + break; // No action + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } - code << "\n // -- Element loop --\n"; - code << " __syncthreads();\n"; - code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; - // Input basis apply if needed - // Generate the correct eval mode code for each input - code << " // -- Input field restrictions and basis actions --\n"; - for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedBasisDestroy(&basis)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Restriction +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, + CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, + CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, + bool use_3d_slices) { + std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); + std::string P_name = (is_all_tensor ? 
"P_1d" : "P") + var_suffix; + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedInt elem_size = 0, num_comp = 0; + CeedSize l_size; + CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; + CeedElemRestriction_Hip *rstr_data; + CeedElemRestriction elem_rstr; + + // Get field data + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); + if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + } + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); - // Restriction - if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { - bool is_strided; + // Restriction + if (is_input) { + // Input + if (field_input_buffer[i] != i) { + std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); - code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; + // Restriction was already done for previous input + code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; + } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { + if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { + // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated + code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; + } else if (rstr_type != CEED_RESTRICTION_POINTS) { + // Otherwise we're using the scratch space + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; + } + switch (rstr_type) { + case CEED_RESTRICTION_STANDARD: { + 
CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); - code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix + << ");\n"; + break; + } + case CEED_RESTRICTION_STRIDED: { + bool has_backend_strides; + CeedInt num_elem; + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); + } + code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] + << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; + code << tab << "ReadLVecStrided" << (is_all_tensor ? 
max_dim : 1) << "d(data, elem, d" << var_suffix << ", r_e" + << var_suffix << ");\n"; + break; + } + case CEED_RESTRICTION_POINTS: { + CeedInt comp_stride; + + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + break; + } + // LCOV_EXCL_START + case CEED_RESTRICTION_ORIENTED: + case CEED_RESTRICTION_CURL_ORIENTED: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + } else { + // Output + switch (rstr_type) { + case CEED_RESTRICTION_STANDARD: { CeedInt comp_stride; + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); - code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); - data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; - code << " readDofsOffset" << dim << "d(data, l_size_in_" << i - << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; - } else { + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; + code << tab << "WriteLVecStandard" << (is_all_tensor ? 
max_dim : 1) << "d(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix + << ");\n"; + break; + } + case CEED_RESTRICTION_STRIDED: { bool has_backend_strides; CeedInt num_elem; @@ -395,334 +521,2202 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); } - code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; - code << " readDofsStrided" << dim << "d(data, elem, d_u_" << i << ", r_u_" << i << ");\n"; + code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] + << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; + code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d(data, elem, r_e" << var_suffix << ", d" << var_suffix + << ");\n"; + break; } + case CEED_RESTRICTION_POINTS: + data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; + break; + // LCOV_EXCL_START + case CEED_RESTRICTION_ORIENTED: + case CEED_RESTRICTION_CURL_ORIENTED: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field, + CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, + bool is_at_points, bool use_3d_slices) { + bool is_tensor = true, is_collocated = true; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + 
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); - // TODO: put in a function? - // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); + std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0; + CeedElemRestriction elem_rstr; + + // Get field data + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); + if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + if (basis != CEED_BASIS_NONE) { + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); + } + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); + + // Basis + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + if (is_input) { switch (eval_mode) { case CEED_EVAL_NONE: - if (!use_collograd_parallelization) { - code << " CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; + if (!use_3d_slices && !is_at_points) { + code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; } break; case CEED_EVAL_INTERP: - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" - << i << ", r_t_" << i << ");\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; + + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? 
Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix + << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; + } else { + std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) + : "InterpNonTensor"; + std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" + << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + } break; case CEED_EVAL_GRAD: - if (use_collograd_parallelization) { - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i - << ", s_B_in_" << i << ", r_t_" << i << ");\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; + + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix + << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; + } else if (use_3d_slices) { + std::string function_name = + (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d"; + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix + << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + } else if (is_tensor) { + bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; + std::string function_name = + (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? 
"CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); + std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" + << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << function_name << "(data, r_e" + << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; } else { - CeedInt P_1d; - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n"; - code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n"; + std::string function_name = "GradNonTensor"; + + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + code << tab << function_name << "(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; } break; - case CEED_EVAL_WEIGHT: - code << " CeedScalar r_t_" << i << "[Q_1d];\n"; - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisGetData(basis, &basis_data)); - data->W = basis_data->d_q_weight_1d; - code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(data, W, r_t_" << i << ");\n"; - break; // No action + case CEED_EVAL_WEIGHT: { + if (is_at_points) { + code << tab << "// Nothing to do AtPoints\n"; + } else { + CeedBasis_Hip_shared *basis_data; + std::string function_name = is_tensor + ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? 
"" : "Flattened")) + : "WeightNonTensor"; + + code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->W = basis_data->d_q_weight_1d; + code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; + } + break; + } + // LCOV_EXCL_START case CEED_EVAL_DIV: - break; // TODO: Not implemented case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } - } + } else { + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; + break; // No action + case CEED_EVAL_INTERP: + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; - // TODO: put in a function + separate collograd logic - // Q function - code << "\n // -- Output field setup --\n"; - for (CeedInt i = 0; i < num_output_fields; i++) { - code << "\n // ---- Output field " << i << " ----\n"; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - if (eval_mode == CEED_EVAL_GRAD) { - if (use_collograd_parallelization) { - // Accumulator for gradient slices - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n"; - code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; - code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; - code << " r_tt_" << i << "[j + i*Q_1d] = 0.0;\n"; - code << " }\n"; - code << " }\n"; - } else { - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*dim*Q_1d];\n"; - } - } - if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n"; - } - } - // We treat quadrature points per slice in 3d to save registers - if 
(use_collograd_parallelization) { - code << "\n // Note: Using planes of 3D elements\n"; - code << "#pragma unroll\n"; - code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; - code << " // -- Input fields --\n"; - for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - bool is_strided; + code << tab << function_name << "(data, r_c" << var_suffix + << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else { + std::string function_name = + is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) + : "InterpTransposeNonTensor"; + std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); - code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; + code << tab << function_name << "(data, r_q" + << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; + if (is_at_points) { + std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (!is_strided) { - CeedInt comp_stride; + code << tab << function_name << "(data, r_c" << var_suffix + << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else if (use_3d_slices) { + std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? 
"CollocatedNodes" : "") + + std::to_string(dim) + "d"; - CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); - code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; - CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); - code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); - data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; - code << " readSliceQuadsOffset" - << "3d(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" - << i << ", r_q_" << i << ");\n"; - } else { - bool has_backend_strides; - CeedInt num_elem; + code << tab << function_name << "(data, r_q" << var_suffix + << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else if (is_tensor) { + bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; + std::string function_name = + (dim == 1 ? "GradTranspose" + : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); + std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? 
P_name : Q_name); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); - CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); - CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + code << tab << function_name << "(data, r_q" + << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; + } else { + std::string function_name = "GradTransposeNonTensor"; - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); - } - code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; - code << " readSliceQuadsStrided" - << "3d(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n"; - } - break; - case CEED_EVAL_INTERP: - code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; - code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; - code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1d];\n"; - code << " }\n"; - break; - case CEED_EVAL_GRAD: - code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*dim];\n"; - code << " gradCollo3d(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n"; - break; - case CEED_EVAL_WEIGHT: - code << " CeedScalar r_q_" << i << "[1];\n"; - code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; - break; // No action + code << tab << function_name << "(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + CeedCallBackend(CeedBasisDestroy(&basis)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// QFunction 
+//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim, + CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields, + CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, + CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, + std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points, + bool use_3d_slices, bool is_assemble) { + std::string Q_name = is_all_tensor ? "Q_1d" : "Q"; + CeedEvalMode eval_mode = CEED_EVAL_NONE; + CeedElemRestriction elem_rstr; + + // Setup output arrays + code << "\n"; + code << tab << "// -- Output field setup\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: + if (is_at_points) { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; + } else { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") + << "];\n"; + } + break; + case CEED_EVAL_INTERP: + if (is_at_points) { + // Accumulator for point data + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix + << "[i] = 0.0;\n"; + } else { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? 
Q_name : "1") + << "];\n"; + } + break; + case CEED_EVAL_GRAD: + if (is_at_points) { + // Accumulator for point data + code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; + code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix + << "[i] = 0.0;\n"; + } else if (use_3d_slices) { + // Accumulator for gradient slices + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; + code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; + } else { + code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" + << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n"; + } + break; + case CEED_EVAL_WEIGHT: + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + + if (is_at_points) { + // We need to handle batches of points + code << "\n"; + code << tab << "// Note: Using batches of points\n"; + code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; + code << tab << "#pragma unroll\n"; + code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; + tab.push(); + code << tab << "const CeedInt p = i % max_num_points;\n\n"; + + code << tab << "// -- Coordinates\n"; + code << tab << "CeedScalar r_x[max_dim];\n"; + code << tab << "ReadPoint(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; + + code << tab << "// -- Input fields\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + const char *field_name; + std::string var_suffix = "_in_" + std::to_string(i); + std::string P_name = "P_1d" + var_suffix; + + 
CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Basis action + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + code << tab << "ReadPoint(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_INTERP: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + code << tab << "InterpAtPoints" << max_dim << "d(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + code << tab << "GradAtPoints" << max_dim << "d(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_WEIGHT: + code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; + code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; + break; + // LCOV_EXCL_START case CEED_EVAL_DIV: - break; // TODO: Not implemented case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } } - code << "\n // -- Output fields --\n"; + code << "\n"; + code << tab << "// -- Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { case CEED_EVAL_NONE: - 
code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; - break; // No action + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + break; case CEED_EVAL_INTERP: - code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; break; case CEED_EVAL_GRAD: - code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*dim];\n"; + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; break; + // LCOV_EXCL_START case CEED_EVAL_WEIGHT: break; // Should not occur case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + + } else if (use_3d_slices) { + // We treat quadrature points per slice in 3d to save registers + code << "\n"; + code << tab << "// Note: Using planes of 3D elements\n"; + code << tab << "#pragma unroll\n"; + code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; + tab.push(); + code << tab << "// -- Input fields\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + const char *field_name; + std::string var_suffix = "_in_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Basis action + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + bool is_strided; + + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) { + bool has_backend_strides; + CeedInt num_elem, elem_size; + + 
CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); + } + code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] + << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; + code << tab << "ReadEVecSliceStrided3d(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; + } else { + CeedSize l_size = 0; + CeedInt comp_stride; + CeedElemRestriction_Hip *rstr_data; + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + code << tab << "ReadEVecSliceStandard3d(data, l_size" + << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + break; + case CEED_EVAL_INTERP: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; + tab.push(); + code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; + tab.pop(); + code << tab << "}\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << 
"];\n"; + code << tab << "GradColloSlice3d(data, q, r_q" << var_suffix << ", s_G" + << var_suffix << ", r_s" << var_suffix << ");\n"; + break; + case CEED_EVAL_WEIGHT: + code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; + code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + code << "\n"; + code << tab << "// -- Output fields\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + break; + case CEED_EVAL_INTERP: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } } } else { - code << "\n // Note: Using full elements\n"; - code << " // -- Input fields --\n"; + code << "\n"; + code << tab << "// Note: Using full elements\n"; + code << tab << "{\n"; + tab.push(); + code << tab << "// -- Input fields\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - code << " CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << 
tab << "// ---- Input field " << i << ": " << field_name << "\n"; + code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; } - code << " // -- Output fields --\n"; + code << tab << "// -- Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - code << " CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; } } - code << "\n // -- QFunction Inputs and outputs --\n"; - code << " CeedScalar* in[" << num_input_fields << "];\n"; + + // Input and output buffers + code << "\n"; + code << tab << "// -- QFunction inputs and outputs\n"; + code << tab << "// ---- Inputs\n"; + code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field " << i << " ----\n"; - code << " in[" << i << "] = r_q_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); + code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; + code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; } - code << " CeedScalar* out[" << num_output_fields << "];\n"; + code << tab << "// ---- Outputs\n"; + code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - code << " out[" << i << "] = r_qq_" << i << ";\n"; + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; + code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; 
} - code << "\n // -- Apply QFunction --\n"; - code << " " << qfunction_name << "(ctx, "; - if (dim != 3 || use_collograd_parallelization) { + + // Apply QFunction + code << "\n"; + code << tab << "// -- Apply QFunction\n"; +#ifdef __HIP_PLATFORM_SPIRV__ + code << tab << "if (elem < num_elem) {\n"; + tab.push(); +#endif + code << tab << "" << qfunction_name << "(ctx, "; + if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { code << "1"; } else { - code << "Q_1d"; + code << Q_name; } - code << ", in, out);\n"; - if (use_collograd_parallelization) { - code << " // -- Output fields --\n"; + code << ", inputs, outputs);\n"; +#ifdef __HIP_PLATFORM_SPIRV__ + tab.pop(); + code << tab << "}\n"; +#endif + + if (is_at_points) { + // Map back to coefficients + code << "\n"; + code << tab << "// -- Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + std::string P_name = "P_1d" + var_suffix; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; switch (eval_mode) { - case CEED_EVAL_NONE: - code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; - code << " r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n"; - code << " }\n"; - break; // No action + case CEED_EVAL_NONE: { + CeedInt comp_stride; + CeedElemRestriction elem_rstr; + + if (is_assemble) break; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + 
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + code << tab << "WritePoint(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" + << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; + break; + } case CEED_EVAL_INTERP: - code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; - code << " r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n"; - code << " }\n"; + code << tab << "if (i >= points.num_per_elem[elem]) {\n"; + tab.push(); + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "InterpTransposeAtPoints" << max_dim << "d(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; break; case CEED_EVAL_GRAD: - code << " gradColloTranspose3d(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n"; + code << tab << "if (i >= points.num_per_elem[elem]) {\n"; + tab.push(); + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; + tab.pop(); + code << tab << "}\n"; + code << tab << "GradTransposeAtPoints" << max_dim << "d(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; break; + // LCOV_EXCL_START case CEED_EVAL_WEIGHT: break; // Should not occur case CEED_EVAL_DIV: + case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + } else if (use_3d_slices) { + // Copy or apply transpose grad, if needed + code << "\n"; + code << tab << "// -- Output fields\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + std::string var_suffix = "_out_" + std::to_string(i); + std::string P_name = "P_1d" + var_suffix; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " 
<< field_name << "\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; + tab.push(); + code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; + tab.pop(); + code << tab << "}\n"; + break; + case CEED_EVAL_INTERP: + code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; + tab.push(); + code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; + tab.pop(); + code << tab << "}\n"; + break; + case CEED_EVAL_GRAD: + code << tab << "GradColloSliceTranspose3d(data, q, r_s" << var_suffix << ", s_G" + << var_suffix << ", r_q" << var_suffix << ");\n"; + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: case CEED_EVAL_CURL: break; // TODO: Not implemented + // LCOV_EXCL_STOP } } - code << " }\n"; } + tab.pop(); + code << tab << "}\n"; + return CEED_ERROR_SUCCESS; +} - // Output basis apply if needed - // Generate the correct eval mode code for each output - code << "\n // -- Output field basis action and restrictions --\n"; - for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field " << i << " ----\n"; - // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); - // TODO put in a function - // Basis action - code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; - switch (eval_mode) { - case CEED_EVAL_NONE: - code << " CeedScalar* 
r_v_" << i << " = r_tt_" << i << ";\n"; - break; // No action - case CEED_EVAL_INTERP: - code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; - code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i - << ", s_B_out_" << i << ", r_v_" << i << ");\n"; - break; - case CEED_EVAL_GRAD: - code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; - if (use_collograd_parallelization) { - code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i - << ", s_B_out_" << i << ", r_v_" << i << ");\n"; - } else { - CeedInt P_1d; - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n"; - } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); - break; // Should not occur +//------------------------------------------------------------------------------ +// Build single operator kernel +//------------------------------------------------------------------------------ +extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { + bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; + Ceed ceed; + CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + 
CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip_gen *data; + std::ostringstream code; + Tab tab; + + CeedCallBackend(CeedOperatorGetData(op, &data)); + { + bool is_setup_done; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) { + *is_good_build = !data->use_fallback; + return CEED_ERROR_SUCCESS; + } + } + + // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + { + bool has_shared_bases = true; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); } - // LCOV_EXCL_STOP + CeedCallBackend(CeedBasisDestroy(&basis)); } - // TODO put in a function - // Restriction - bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (!is_strided) { - CeedInt comp_stride; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; - CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); - code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; - CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); - code << " // CompStride: " << comp_stride << "\n"; - 
CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); - data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; - code << " writeDofsOffset" << dim << "d(data, l_size_out_" << i - << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; - } else { - bool has_backend_strides; - CeedInt num_elem; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); - CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); - CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); } - code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; - code << " writeDofsStrided" << dim << "d(data, elem, r_v_" << i << ", d_v_" << i << ");\n"; + CeedCallBackend(CeedBasisDestroy(&basis)); + } + // -- Fallback to ref if not all bases are shared + if (!has_shared_bases) { + *is_good_build = false; + return CEED_ERROR_SUCCESS; + } + } + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, 
&qf_input_fields, NULL, &qf_output_fields)); + + // Get operator data + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + { + CeedInt max_P = 0, max_P_1d = 0; + + CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, + qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices)); + data->max_P_1d = is_all_tensor ? max_P_1d : max_P; + } + if (is_at_points) { + CeedInt coords_dim = 0; + CeedElemRestriction_Hip *rstr_data; + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim)); + CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); + data->points.indices = (CeedInt *)rstr_data->d_offsets; + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + if (max_dim == 0) max_dim = coords_dim; + if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim)); + } + if (max_dim == 0) max_dim = 1; + data->dim = max_dim; + if (is_at_points) use_3d_slices = false; + if (Q_1d == 0) { + if (is_at_points) Q_1d = max_num_points; + else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); + } + if (Q == 0) Q = Q_1d; + data->Q = Q; + data->Q_1d = Q_1d; + + // Check for restriction only identity operator + { + bool is_identity_qf; + + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + if (is_identity_qf) { + CeedEvalMode eval_mode_in, eval_mode_out; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); + CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != 
CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, + "Backend does not implement restriction only identity operators"); } } - code << " }\n"; - code << "}\n"; - code << "// -----------------------------------------------------------------------------\n\n"; + // Load basis source files + if (!is_all_nontensor) { + code << tab << "// Tensor basis source\n"; + code << tab << "#include \n\n"; + } + if (!is_all_tensor) { + code << tab << "// Non-tensor basis source\n"; + code << tab << "#include \n\n"; + } + if (is_at_points) { + code << tab << "// AtPoints basis source\n"; + code << tab << "#include \n\n"; + } + if (!is_all_tensor && !is_all_nontensor) { + code << tab << "// Tensor basis source\n"; + code << tab << "#include \n\n"; + } + code << tab << "// CodeGen operator source\n"; + code << tab << "#include \n\n"; - // View kernel for debugging - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n"); - CeedDebug(ceed, code.str().c_str()); + // Get QFunction name + std::string qfunction_name(qf_data->qfunction_name); + std::string operator_name; - CeedInt block_sizes[3] = {0, 0, 0}; - CeedInt num_elem; + operator_name = "CeedKernelHipGenOperator_" + qfunction_name; - CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); - CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", - block_sizes[0] * block_sizes[1] * block_sizes[2])); - CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); + // Define CEED_Q_VLA + code << "\n" << tab << "#undef CEED_Q_VLA\n"; + if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { + code << tab << "#define CEED_Q_VLA 1\n\n"; + } else { + code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; + } - CeedCallBackend(CeedOperatorSetSetupDone(op)); + // Add user QFunction source + { + const char *source_path; + + 
CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); + + code << tab << "// User QFunction source\n"; + code << tab << "#include \"" << source_path << "\"\n\n"; + } + + // Setup + code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "// Operator Kernel\n"; + code << tab << "// \n"; + code << tab << "// d_[in,out]_i: CeedVector device array\n"; + code << tab << "// r_[in,out]_e_i: Element vector register\n"; + code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; + code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; + code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; + code << tab << "// \n"; + code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; + code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; + code << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; + code << "__global__ void " << operator_name + << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; + tab.push(); + + // Scratch buffers + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + } + + code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; + if (!is_all_tensor) { + 
code << tab << "const CeedInt Q = " << Q << ";\n"; + } + if (!is_all_nontensor) { + code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; + } + if (is_at_points) { + code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; + code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; + } + + // Shared data + code << tab << "extern __shared__ CeedScalar slice[];\n"; + code << tab << "SharedData_Hip data;\n"; + code << tab << "data.t_id_x = threadIdx.x;\n"; + code << tab << "data.t_id_y = threadIdx.y;\n"; + code << tab << "data.t_id_z = threadIdx.z;\n"; + code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; + code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; + + // -- Determine input mat reuse + FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + 
if (eval_mode_i == eval_mode_j) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat reuse + FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + if (basis_i == basis_j) 
{ + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // Initialize constants, and matrices B and G + code << "\n" << tab << "// Input field constants and basis data\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], + max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); + } + code << "\n" << tab << "// Output field constants and basis data\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], + max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); + } + + // Loop over all elements + code << "\n" << tab << "// Element loop\n"; + code << tab << "__syncthreads();\n"; +#ifdef __HIP_PLATFORM_SPIRV__ + code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n"; +#else + code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; + tab.push(); +#endif + + // -- Compute minimum buffer space needed + CeedInt max_rstr_buffer_size = 1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + code << tab << "// Scratch restriction buffer space\n"; + code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; + + // -- Determine best input field processing order + CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + field_rstr_in_buffer[i] = -1; + input_field_order[i] = -1; + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + field_rstr_in_buffer[i] = i; + is_ordered[i] = true; + input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_input_fields; j++) 
{ + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + field_rstr_in_buffer[j] = i; + is_ordered[j] = true; + input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + + // -- Input restriction and basis + code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + const char *field_name; + const CeedInt f = input_field_order[i]; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], + max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + + // -- Q function + CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, + qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, + Q_1d, is_all_tensor, is_at_points, use_3d_slices, false)); + + // -- Output basis and restriction + code << "\n" << tab << "// -- Output field basis action and restrictions\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + const char *field_name; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], 
&field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, + is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, + false, is_all_tensor, is_at_points, use_3d_slices)); + } + + // Close loop and function +#ifndef __HIP_PLATFORM_SPIRV__ + tab.pop(); + code << tab << "}\n"; +#endif + tab.pop(); + code << tab << "}\n"; + code << tab << "// -----------------------------------------------------------------------------\n\n"; + + CeedInt block_sizes[3] = {0, 0, 0}; + CeedInt num_elem; + + // Compile + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? 
Q_1d : Q, block_sizes)); + { + bool is_compile_good = false; + + data->thread_1d = block_sizes[0]; + CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE", + block_sizes[0] * block_sizes[1] * block_sizes[2])); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); + } else { + *is_good_build = false; + data->use_fallback = true; + } + } + CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Build AtPoints assembly operator kernel +//------------------------------------------------------------------------------ +static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) { + bool is_all_tensor = true, is_at_points = false, use_3d_slices = false; + Ceed ceed; + CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip_gen *data; + std::ostringstream code; + Tab tab; + + // Check compatibility + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported"); + + // Retrieve operator data + CeedCallBackend(CeedOperatorGetData(op, &data)); + Q = data->Q; + Q_1d = data->Q_1d; + max_dim = data->dim; + { + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + 
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + + // Load basis source files + code << tab << "// Tensor basis source\n"; + code << tab << "#include \n\n"; + code << tab << "// AtPoints basis source\n"; + code << tab << "#include \n\n"; + code << tab << "// CodeGen operator source\n"; + code << tab << "#include \n\n"; + + // Get QFunction name + std::string qfunction_name(qf_data->qfunction_name); + std::string operator_name; + + if (is_full) { + operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name; + } else { + operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name; + } + + // Define CEED_Q_VLA + code << "\n" << tab << "#undef CEED_Q_VLA\n"; + code << tab << "#define CEED_Q_VLA 1\n\n"; + + // Add user QFunction source + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); + + code << tab << "// User QFunction source\n"; + code << tab << "#include \"" << source_path << "\"\n\n"; + } + + // Setup + code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "// Operator Assembly Kernel\n"; + code << tab << "// \n"; + code << tab << "// d_[in,out]_i: CeedVector device array\n"; + code << tab << "// r_[in,out]_e_i: Element vector register\n"; + code << tab << "// r_[in,out]_q_i: Quadrature 
space vector register\n"; + code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; + code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; + code << tab << "// \n"; + code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; + code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; + code << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "extern \"C\" __global__ void " << operator_name + << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " + "points, CeedScalar *__restrict__ values_array) {\n"; + tab.push(); + + // Scratch buffers + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + } + + code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; + code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; + code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; + code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; + + // Shared data + code << tab << "extern __shared__ CeedScalar slice[];\n"; + code << tab << "SharedData_Hip data;\n"; + code << tab << "data.t_id_x = threadIdx.x;\n"; + code << tab << "data.t_id_y = threadIdx.y;\n"; + code << tab << "data.t_id_z = threadIdx.z;\n"; + code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; + code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || 
max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; + + // -- Determine input mat reuse + FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat reuse + FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + output_matrix_reuse[i].index = j; 
+ output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + if (basis_i == basis_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // Initialize constants, and matrices B and G + code << "\n" << tab << "// Input field constants and basis data\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], + max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); + } + code << "\n" << tab << "// Output field constants and basis data\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], + max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); + } + + // Loop over all elements + code << "\n" << tab << "// Element loop\n"; + code << tab << "__syncthreads();\n"; +#ifdef __HIP_PLATFORM_SPIRV__ + code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n"; +#else + code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; + tab.push(); +#endif + + // -- Compute minimum buffer space needed + CeedInt max_rstr_buffer_size = 1; + + for (CeedInt i = 0; i < 
num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + code << tab << "// Scratch restriction buffer space\n"; + code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; + + // -- Determine best input field processing order + CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + field_rstr_in_buffer[i] = -1; + input_field_order[i] = -1; + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + field_rstr_in_buffer[i] = i; + is_ordered[i] = true; + input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + field_rstr_in_buffer[j] = i; + is_ordered[j] = true; + input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + + // -- Input restriction and basis + code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; + CeedInt active_field_index = -1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false; + const char *field_name; + const CeedInt f = 
input_field_order[i]; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + if (is_active) { + std::string var_suffix = "_in_" + std::to_string(f); + + code << tab << "// Active field - no restriction or basis action here\n"; + if (active_field_index == -1) { + active_field_index = f; + code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1") + << "] = {0.0};\n"; + } else { + code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; + } + } else { + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], + max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + } + + // -- Loop over active field + std::string active_var_suffix = "_in_" + std::to_string(active_field_index); + + code << "\n" << tab << "// Loop over nodes in active field\n"; + code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix + << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? 
"*P_1d" + active_var_suffix : "") << "; n++) {\n"; + tab.push(); + + // -- Set current active node and component to 1 + code << tab << "// Set current active node and component to 1.0\n"; + code << tab << "SetEVecStandard" << max_dim << "d_Single(data, n, 1.0, r_e" + << active_var_suffix << ");\n\n"; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false; + const char *field_name; + const CeedInt f = input_field_order[i]; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (!is_active) continue; + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + + // -- Q function + CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, + qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, + Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); + + // -- Output basis and restriction + code << "\n" << tab << "// -- Output field basis action and restrictions\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + const char *field_name; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (!is_active) continue; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + + // ---- Basis action + 
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, + is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Restriction + if (is_full) { + std::string var_suffix = "_out_" + std::to_string(i); + CeedInt comp_stride; + CeedSize l_size; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + code << tab << "WriteLVecStandard" << max_dim << "d_Assembly(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n"; + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } else { + std::string var_suffix = "_out_" + std::to_string(i); + CeedInt comp_stride; + CeedSize l_size; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; + code << tab << "WriteLVecStandard" << max_dim << "d_Single(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n"; + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + + // -- Reset current active node and component + code << "\n" << tab << "// Reset current active node and component to 0.0\n"; + code << tab << "SetEVecStandard" << max_dim << "d_Single(data, n, 0.0, r_e" 
+ << active_var_suffix << ");\n"; + + // -- End of loop over active field + tab.pop(); + code << tab << "}\n"; + + // Close loop and function +#ifndef __HIP_PLATFORM_SPIRV__ + tab.pop(); + code << tab << "}\n"; +#endif + tab.pop(); + code << tab << "}\n"; + code << tab << "// -----------------------------------------------------------------------------\n\n"; + + CeedInt block_sizes[3] = {0, 0, 0}; + CeedInt num_elem; + + // Compile + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); + { + bool is_compile_good = false; + + data->thread_1d = block_sizes[0]; + CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0], + "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), + is_full ? 
&data->assemble_full : &data->assemble_diagonal)); + } else { + *is_good_build = false; + data->use_assembly_fallback = true; + } + } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { + return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build); +} + +extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { + return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build); +} +//------------------------------------------------------------------------------ +// Build QFunction assembly operator kernel +//------------------------------------------------------------------------------ +extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) { + bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; + Ceed ceed; + CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip_gen *data; + std::ostringstream code; + Tab tab; + + // Check compatibility + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported"); + + // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + { + bool has_shared_bases = true; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis 
!= CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; + + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + } + + // Retrieve operator data + CeedCallBackend(CeedOperatorGetData(op, &data)); + Q = data->Q; + Q_1d = data->Q_1d; + max_dim = data->dim; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Load basis source files + if (!is_all_nontensor) { + code << tab << "// Tensor basis 
source\n"; + code << tab << "#include \n\n"; + } + if (!is_all_tensor) { + code << tab << "// Non-tensor basis source\n"; + code << tab << "#include \n\n"; + } + if (!is_all_tensor && !is_all_nontensor) { + code << "// Tensor basis source\n"; + code << "#include \n\n"; + } + code << "// CodeGen operator source\n"; + code << "#include \n\n"; + + // Get QFunction name + std::string qfunction_name(qf_data->qfunction_name); + std::string operator_name; + + operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name; + + // Define CEED_Q_VLA + code << "\n" << tab << "#undef CEED_Q_VLA\n"; + if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { + code << tab << "#define CEED_Q_VLA 1\n\n"; + } else { + code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; + } + + // Add user QFunction source + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); + + code << tab << "// User QFunction source\n"; + code << tab << "#include \"" << source_path << "\"\n\n"; + } + + // Setup + code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; + code << tab << "// Operator Assembly Kernel\n"; + code << tab << "// \n"; + code << tab << "// d_[in,out]_i: CeedVector device array\n"; + code << tab << "// r_[in,out]_e_i: Element vector register\n"; + code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; + code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; + code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; + code << tab << "// \n"; + code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; + code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; + code << tab << "// -----------------------------------------------------------------------------\n"; + code 
<< tab << "extern \"C\" __global__ void " << operator_name + << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " + "points, CeedScalar *__restrict__ values_array) {\n"; + tab.push(); + + // Scratch buffers + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (is_active) { + code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + } + } + + code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; + if (!is_all_tensor) { + code << tab << "const CeedInt Q = " << Q << ";\n"; + } + if (!is_all_nontensor) { + code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; + } + + // Shared data + code << tab << "extern __shared__ CeedScalar slice[];\n"; + code << tab << "SharedData_Hip data;\n"; + code << tab << "data.t_id_x = threadIdx.x;\n"; + code << tab << "data.t_id_y = threadIdx.y;\n"; + code << tab << "data.t_id_z = threadIdx.z;\n"; + code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; + code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? 
"" : "*OP_T_1D") << ";\n"; + + // -- Determine input mat reuse + FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + input_matrix_reuse[i].index = j; + input_matrix_reuse[i].is_input = true; + input_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat reuse + FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i].index = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_tensor = true; + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); + for 
(CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = true; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i].index = j; + output_matrix_reuse[i].is_input = false; + output_matrix_reuse[i].eval_mode = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // Initialize constants, and matrices B and G + code << "\n" << tab << "// Input field constants and basis data\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 
+ max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); + } + code << "\n" << tab << "// Output field constants and basis data\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], + max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); + } + + // Loop over all elements + code << "\n" << tab << "// Element loop\n"; + code << tab << "__syncthreads();\n"; +#ifdef __HIP_PLATFORM_SPIRV__ + code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n"; +#else + code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; + tab.push(); +#endif + + // -- Compute minimum buffer space needed + CeedInt max_rstr_buffer_size = 1; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode != CEED_EVAL_NONE) { + CeedInt num_comp; + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + } + code << tab << "// Scratch restriction buffer space\n"; + code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; + + // -- Determine best input field processing order + CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + field_rstr_in_buffer[i] = -1; + input_field_order[i] = -1; + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + field_rstr_in_buffer[i] = i; + is_ordered[i] = true; + input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + 
field_rstr_in_buffer[j] = i; + is_ordered[j] = true; + input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + + // -- Input restriction and basis + code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; + CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; + CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false; + const char *field_name; + const CeedInt f = input_field_order[i]; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); + code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; + + if (is_active) { + CeedEvalMode eval_mode; + CeedInt field_size; + + active_fields_in[num_active_in] = f; + num_active_in++; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); + if (eval_mode == CEED_EVAL_GRAD) { + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" + << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + } else { + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? 
"Q_1d" : "1") << "] = {0.};\n"; + } + code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; + } else { + // ---- Restriction + CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], + max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); + + // ---- Basis action + CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, + is_all_tensor, is_at_points, use_3d_slices)); + } + } + code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; + for (CeedInt i = 0; i < num_active_in; i++) { + code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); + } + code << "};\n"; + code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; + for (CeedInt i = 0; i < num_active_in; i++) { + code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? 
", " : ""); + } + code << "};\n"; + + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (is_active) { + const char *field_name; + CeedInt field_size; + + active_fields_out[num_active_out] = i; + num_active_out++; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_assembly_size_out += field_size; + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; + } + } + code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; + for (CeedInt i = 0; i < num_active_out; i++) { + code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? 
", " : ""); + } + code << "};\n"; + code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; + + // -- Loop over active field + code << "\n" << tab << "CeedInt input_offset = 0;\n"; + code << tab << "// Loop over active QFunction input fields\n"; + code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; + code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; + tab.push(); + + // -- Loop over size of active field + code << "\n" << tab << "// Loop over current active input field size\n"; + code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; + code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; + tab.push(); + + // -- Set current active point and component to 1 + code << tab << "// Set current active point and component to 1.0\n"; + if (is_all_tensor && (max_dim >= 3)) { + code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; + } else { + code << tab << "r_q_in[a][s] = 1.0;\n"; + } + + // -- Q function + CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, + qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, + Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); + + // -- Output basis and restriction + code << "\n" << tab << "// -- Output field basis action and restrictions\n"; + CeedScalar offset = 0; + + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + const char *field_name; + + { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + } + if (!is_active) continue; + + CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); + code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; + + // ---- Restriction + CeedInt field_size; + + code 
<< tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + offset += field_size; + } + + // -- Reset current active node and component + code << "\n" << tab << "// Reset current active node and component to 0.0\n"; + if (is_all_tensor && (max_dim >= 3)) { + code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; + } else { + code << tab << "r_q_in[a][s] = 0.0;\n"; + } + + // -- End of loop over size of active field + tab.pop(); + code << tab << "}\n"; + code << tab << "input_offset += field_size_in;\n"; + + // -- End of loop over active field + tab.pop(); + code << tab << "}\n"; + + // Close loop and function +#ifndef __HIP_PLATFORM_SPIRV__ + tab.pop(); + code << tab << "}\n"; +#endif + tab.pop(); + code << tab << "}\n"; + code << tab << "// -----------------------------------------------------------------------------\n\n"; + + CeedInt block_sizes[3] = {0, 0, 0}; + CeedInt num_elem; + + // Compile + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); + { + bool is_compile_good = false; + + data->thread_1d = block_sizes[0]; + CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0], + "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); + } else { + *is_good_build = false; + data->use_assembly_fallback = true; + } + } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } diff --git 
a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h index c17ba46eeb..0bb7f20df3 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.h +++ b/backends/hip-gen/ceed-hip-gen-operator-build.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,4 +7,7 @@ #pragma once CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedInt P_1d, CeedInt Q_1d, CeedInt *block_sizes); -CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op); +CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build); +CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build); +CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build); +CEED_INTERN int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build); diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index fcd58ed76d..7532ba55b6 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -9,6 +9,7 @@ #include #include #include +#include #include "../hip/ceed-hip-common.h" #include "../hip/ceed-hip-compile.h" @@ -19,27 +20,51 @@ // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { + Ceed ceed; CeedOperator_Hip_gen *impl; + bool is_composite; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { + CeedInt num_suboperators; + + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + for (CeedInt i = 0; i < num_suboperators; i++) { + if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i])); + impl->streams[i] = NULL; + } + } + if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module)); + if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full)); + if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal)); + if (impl->module_assemble_qfunction) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_qfunction)); + if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { +static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr, + bool *is_run_good, CeedRequest *request) { + bool is_at_points, is_tensor; Ceed ceed; CeedInt num_elem, 
num_input_fields, num_output_fields; CeedEvalMode eval_mode; - CeedVector output_vecs[CEED_FIELD_MAX] = {NULL}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction_Hip_gen *qf_data; CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Hip_gen *data; + // Creation of the operator + CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, is_run_good)); + if (!(*is_run_good)) return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &data)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); @@ -48,37 +73,21 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Check for tensor-product bases - { - bool has_tensor_bases; - - CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases)); - // -- Fallback to ref if not all bases are tensor-product - if (!has_tensor_bases) { - CeedOperator op_fallback; - - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to non-tensor bases"); - CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); - CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); - return CEED_ERROR_SUCCESS; - } - } - - // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op)); - // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.inputs[i] = NULL; } else { + bool is_active; CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - 
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = input_arr; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -88,25 +97,48 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.outputs[i] = NULL; } else { + bool is_active; CeedVector vec; // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; - output_vecs[i] = vec; - // Check for multiple output modes - CeedInt index = -1; - for (CeedInt j = 0; j < i; j++) { - if (vec == output_vecs[j]) { - index = j; - break; - } - } - if (index == -1) { - CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); - } else { - data->fields.outputs[i] = data->fields.outputs[index]; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.outputs[i] = output_arr; + else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Point coordinates, if needed + CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); + if (is_at_points) { + // Coords + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + + // Points per elem + if (num_elem != data->points.num_elem) { + CeedInt *points_per_elem; + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + CeedElemRestriction rstr_points = NULL; + + data->points.num_elem = num_elem; + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); + 
for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + points_per_elem[e] = num_points_elem; } + if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes)); + CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedFree(&points_per_elem)); } } @@ -114,29 +146,37 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); // Apply operator - void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W}; - const CeedInt dim = data->dim; - const CeedInt Q_1d = data->Q_1d; - const CeedInt P_1d = data->max_P_1d; - const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - CeedInt block_sizes[3]; - - CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, P_1d, Q_1d, block_sizes)); - if (dim == 1) { + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points}; + + CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); + CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; + + if (is_tensor) { + CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes)); + } else { + CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; + + elems_per_block = elems_per_block > 0 ? 
elems_per_block : 1; + block_sizes[2] = elems_per_block; + } + if (data->dim == 1 || !is_tensor) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); - CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); - } else if (dim == 2) { + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, + is_run_good, opargs)); + } else if (data->dim == 2) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); - CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); - } else if (dim == 3) { + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, + is_run_good, opargs)); + } else if (data->dim == 3) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); - CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, + is_run_good, opargs)); } // Restore input arrays @@ -144,11 +184,13 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -157,26 +199,675 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; - // Check for multiple output modes - CeedInt index = -1; - for (CeedInt j = 0; j < i; j++) { - if (vec == output_vecs[j]) { - index = j; - break; + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Restore point coordinates, if needed + if (is_at_points) { + CeedVector vec; + + 
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!(*is_run_good)) data->use_fallback = true; + return CEED_ERROR_SUCCESS; +} + +static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { + bool is_run_good = false; + const CeedScalar *input_arr = NULL; + CeedScalar *output_arr = NULL; + + // Try to run kernel + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); + CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, input_arr, output_arr, &is_run_good, request)); + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); + + // Fallback on unsuccessful run + if (!is_run_good) { + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + } + return CEED_ERROR_SUCCESS; +} + +static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { + bool is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential; + CeedInt num_suboperators; + const CeedScalar *input_arr = NULL; + CeedScalar *output_arr = NULL; + Ceed ceed; + CeedOperator_Hip_gen *impl; + 
CeedOperator *sub_operators; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCallBackend(CeedOperatorCompositeGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential)); + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); + for (CeedInt i = 0; i < num_suboperators; i++) { + CeedInt num_elem = 0; + const CeedInt stream_index = is_sequential ? 0 : i; + + CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); + if (num_elem > 0) { + if (!impl->streams[stream_index]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[stream_index])); + CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[stream_index], input_arr, output_arr, &is_run_good[i], + request)); + } else { + is_run_good[i] = true; + } + } + if (is_sequential) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[0])); + else { + for (CeedInt i = 0; i < num_suboperators; i++) { + if (impl->streams[i]) { + if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i])); + } + } + } + if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); + if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); + CeedCallHip(ceed, hipDeviceSynchronize()); + + // Fallback on unsuccessful run + for (CeedInt i = 0; i < num_suboperators; i++) { + if (!is_run_good[i]) { + CeedOperator op_fallback; + + CeedDebug(ceed, "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n"); + CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, 
request)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// QFunction assembly +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { + Ceed ceed; + CeedOperator_Hip_gen *data; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &data)); + + // Build the assembly kernel + if (!data->assemble_qfunction && !data->use_assembly_fallback) { + bool is_build_good = false; + + CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good)); + if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(op, &is_build_good)); + if (!is_build_good) data->use_assembly_fallback = true; + } + + // Try assembly + if (!data->use_assembly_fallback) { + bool is_run_good = true; + Ceed_Hip *hip_data; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedScalar *assembled_array; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + + CeedCallBackend(CeedGetData(ceed, &hip_data)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Input vectors + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) 
{ // Skip + data->fields.inputs[i] = NULL; + } else { + bool is_active; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = NULL; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); + + // Build objects if needed + if (build_objects) { + CeedInt qf_size_in = 0, qf_size_out = 0, Q; + + // Count number of active input fields + { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt field_size; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + // Check if active input + if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); + qf_size_in += field_size; + } + CeedCallBackend(CeedVectorDestroy(&vec)); + } + CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + } + + // Count number of active output fields + { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt field_size; + CeedVector vec; + + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + // Check if active output + if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_size_out += field_size; + } + CeedCallBackend(CeedVectorDestroy(&vec)); } + CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + } + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + + // Actually build objects now + const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; + CeedInt strides[3] = {1, 
num_elem * Q, Q}; /* *NOPAD* */ + + // Create output restriction + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, + (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, + rstr)); + // Create assembled vector + CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); + } + + // Assembly array + CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array)); + + // Assemble QFunction + bool is_tensor = false; + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; + + CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); + CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; + + if (is_tensor) { + CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes)); + } else { + CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; + + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + block_sizes[2] = elems_per_block; + } + if (data->dim == 1 || !is_tensor) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], + sharedMem, &is_run_good, opargs)); + } else if (data->dim == 2) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], + sharedMem, &is_run_good, opargs)); + } else if (data->dim == 3) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], + sharedMem, &is_run_good, opargs)); + } + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); } - if (index == -1) { - CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Restore assembly array + CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); + + // Cleanup + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!is_run_good) { + data->use_assembly_fallback = true; + if (build_objects) { + CeedCallBackend(CeedVectorDestroy(assembled)); + CeedCallBackend(CeedElemRestrictionDestroy(rstr)); } } } + CeedCallBackend(CeedDestroy(&ceed)); - // Restore context data - CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + // Fallback, if needed + if (data->use_assembly_fallback) { + CeedOperator 
op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request)); + return CEED_ERROR_SUCCESS; + } + return CEED_ERROR_SUCCESS; +} + +static int CeedOperatorLinearAssembleQFunction_Hip_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, true, assembled, rstr, request); +} + +static int CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, false, &assembled, &rstr, request); +} + +//------------------------------------------------------------------------------ +// AtPoints diagonal assembly +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) { + Ceed ceed; + CeedOperator_Hip_gen *data; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &data)); + + // Build the assembly kernel + if (!data->assemble_diagonal && !data->use_assembly_fallback) { + bool is_build_good = false; + CeedInt num_active_bases_in, num_active_bases_out; + CeedOperatorAssemblyData assembly_data; + + CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); + CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, + NULL, NULL)); + if (num_active_bases_in == num_active_bases_out) { + CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good)); + if (is_build_good) 
CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good)); + } + if (!is_build_good) data->use_assembly_fallback = true; + } + + // Try assembly + if (!data->use_assembly_fallback) { + bool is_run_good = true; + Ceed_Hip *hip_data; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedScalar *assembled_array; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + + CeedCallBackend(CeedGetData(ceed, &hip_data)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Input vectors + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + data->fields.inputs[i] = NULL; + } else { + bool is_active; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = NULL; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + + // Points per elem + if (num_elem != data->points.num_elem) { + CeedInt *points_per_elem; + const CeedInt num_bytes = num_elem * sizeof(CeedInt); 
+ CeedElemRestriction rstr_points = NULL; + + data->points.num_elem = num_elem; + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + points_per_elem[e] = num_points_elem; + } + if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes)); + CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedFree(&points_per_elem)); + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); + + // Assembly array + CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); + + // Assemble diagonal + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; + + CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; + + CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes)); + block_sizes[2] = 1; + if (data->dim == 1) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], + sharedMem, &is_run_good, opargs)); + } else if (data->dim == 2) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], + sharedMem, &is_run_good, opargs)); + } else if (data->dim == 3) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], + sharedMem, &is_run_good, opargs)); + } + CeedCallHip(ceed, hipDeviceSynchronize()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Restore point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Restore assembly array + CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); + + // Cleanup + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!is_run_good) data->use_assembly_fallback = true; + } + CeedCallBackend(CeedDestroy(&ceed)); + + // Fallback, if needed + if (data->use_assembly_fallback) { + CeedOperator op_fallback; + + 
CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request)); + return CEED_ERROR_SUCCESS; + } + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// AtPoints full assembly +//------------------------------------------------------------------------------ +static int CeedOperatorAssembleSingleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) { + Ceed ceed; + CeedOperator_Hip_gen *data; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &data)); + + // Build the assembly kernel + if (!data->assemble_full && !data->use_assembly_fallback) { + bool is_build_good = false; + CeedInt num_active_bases_in, num_active_bases_out; + CeedOperatorAssemblyData assembly_data; + + CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); + CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, + NULL, NULL)); + if (num_active_bases_in == num_active_bases_out) { + CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good)); + if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good)); + } + if (!is_build_good) { + CeedDebug(ceed, "Single Operator Assemble at Points compile failed, using fallback\n"); + data->use_assembly_fallback = true; + } + } + + // Try assembly + if (!data->use_assembly_fallback) { + bool is_run_good = true; + Ceed_Hip *Hip_data; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedScalar *assembled_array; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + 
CeedOperatorField *op_input_fields, *op_output_fields; + + CeedCallBackend(CeedGetData(ceed, &Hip_data)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedDebug(ceed, "Running single operator assemble for /gpu/hip/gen\n"); + + // Input vectors + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + data->fields.inputs[i] = NULL; + } else { + bool is_active; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) data->fields.inputs[i] = NULL; + else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + + // Points per elem + if (num_elem != data->points.num_elem) { + CeedInt *points_per_elem; + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + CeedElemRestriction rstr_points = NULL; + + data->points.num_elem = num_elem; + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + points_per_elem[e] = num_points_elem; 
+ } + if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes)); + CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedFree(&points_per_elem)); + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); + + // Assembly array + CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); + CeedScalar *assembled_offset_array = &assembled_array[offset]; + + // Assemble diagonal + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, + &data->G, &data->W, &data->points, &assembled_offset_array}; + + CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; + + CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes)); + block_sizes[2] = 1; + if (data->dim == 1) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, + &is_run_good, opargs)); + } else if (data->dim == 2) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, + &is_run_good, opargs)); + } else if (data->dim == 3) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, + &is_run_good, opargs)); + } + CeedCallHip(ceed, hipDeviceSynchronize()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + } + + // Restore point coordinates + { + CeedVector vec; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); + CeedCallBackend(CeedVectorDestroy(&vec)); + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Restore assembly array + CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); + + // Cleanup + CeedCallBackend(CeedQFunctionDestroy(&qf)); + if (!is_run_good) { + CeedDebug(ceed, "Single Operator Assemble at Points run failed, using fallback\n"); + data->use_assembly_fallback = true; + } + } + CeedCallBackend(CeedDestroy(&ceed)); + + // Fallback, if needed + if 
(data->use_assembly_fallback) { + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled)); + return CEED_ERROR_SUCCESS; + } return CEED_ERROR_SUCCESS; } @@ -184,14 +875,30 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Hip_gen(CeedOperator op) { + bool is_composite, is_at_points; Ceed ceed; CeedOperator_Hip_gen *impl; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen)); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Hip_gen)); + } else { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen)); + } + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); + if (is_at_points) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Hip_gen)); + } + if (!is_at_points) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen)); + } 
CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c index ed10d81ad3..872f312594 100644 --- a/backends/hip-gen/ceed-hip-gen-qfunction.c +++ b/backends/hip-gen/ceed-hip-gen-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) { CeedCallBackend(CeedQFunctionGetData(qf, &data)); CeedCallHip(CeedQFunctionReturnCeed(qf), hipFree(data->d_c)); - CeedCallBackend(CeedFree(&data->qfunction_source)); CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -43,15 +42,11 @@ int CeedQFunctionCreate_Hip_gen(CeedQFunction qf) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); - // Read QFunction source CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); - CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! 
-----\n"); - CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip_gen)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c index d66ceb041a..8b3ead0db7 100644 --- a/backends/hip-gen/ceed-hip-gen.c +++ b/backends/hip-gen/ceed-hip-gen.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -17,10 +17,9 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) { - char *resource_root; - const char fallback_resource[] = "/gpu/hip/ref"; - Ceed ceed_shared; - Ceed_Hip *data; + char *resource_root; + Ceed ceed_shared, ceed_ref; + Ceed_Hip *data; CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/hip") || !strcmp(resource_root, "/gpu/hip/gen"), ceed, CEED_ERROR_BACKEND, @@ -33,11 +32,16 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedInit("/gpu/hip/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); + CeedCallBackend(CeedDestroy(&ceed_shared)); - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); + CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref)); + CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "CompositeOperatorCreate", CeedOperatorCreate_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Hip_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h index a0a8ac5511..1590f217f2 100644 --- a/backends/hip-gen/ceed-hip-gen.h +++ b/backends/hip-gen/ceed-hip-gen.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -12,21 +12,24 @@ #include typedef struct { + bool use_fallback, use_assembly_fallback; CeedInt dim; - CeedInt Q_1d; + CeedInt Q, Q_1d; CeedInt max_P_1d; - hipModule_t module; - hipFunction_t op; + CeedInt thread_1d; + hipStream_t streams[CEED_COMPOSITE_MAX]; + hipModule_t module, module_assemble_full, module_assemble_diagonal, module_assemble_qfunction; + hipFunction_t op, assemble_full, assemble_diagonal, assemble_qfunction; FieldsInt_Hip indices; Fields_Hip fields; Fields_Hip B; Fields_Hip G; CeedScalar *W; + Points_Hip points; } CeedOperator_Hip_gen; typedef struct { const char *qfunction_name; - const char *qfunction_source; void *d_c; } CeedQFunction_Hip_gen; diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index 3163018669..a05bba5006 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "../hip/ceed-hip-common.h" @@ -17,7 +18,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -32,15 +34,14 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation - if (is_transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); - CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + // Clear v for transpose operation + if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); } + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); @@ -59,6 +60,7 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args)); } break; case CEED_EVAL_WEIGHT: { + 
CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; const int block_size_x = Q_1d; const int block_size_y = dim >= 2 ? Q_1d : 1; @@ -78,14 +80,179 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis apply - tensor AtPoints +//------------------------------------------------------------------------------ +static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + Ceed ceed; + CeedInt Q_1d, dim, max_num_points = num_points[0]; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 32; + const CeedScalar *d_x, *d_u; + CeedScalar *d_v; + CeedBasis_Hip *data; + + CeedCallBackend(CeedBasisGetData(basis, &data)); + 
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + + // Weight handled separately + if (eval_mode == CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorSetValue(v, 1.0)); + return CEED_ERROR_SUCCESS; + } + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 
+ " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); + } + } + + // Build kernels if needed + if (data->num_points != max_num_points) { + CeedInt P_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + data->num_points = max_num_points; + + // -- Create interp matrix to Chebyshev coefficients + if (!data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; + + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + + // -- Compile kernels + const char basis_kernel_source[] = "// AtPoints basis source\n#include \n"; + CeedInt num_comp; + + if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, 
"BASIS_P_1D", P_1d, "BASIS_BUF_LEN", + Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, + "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", + max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); + } + + // Get read/write access to u, v + CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); + if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + // Clear v for transpose operation + if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } + + // Basis action + switch (eval_mode) { + case CEED_EVAL_INTERP: { + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + + CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? 
data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, + interp_args)); + } break; + case CEED_EVAL_GRAD: { + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + + CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); + } break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_NONE: /* handled separately below */ + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + + // Restore vectors, cover CEED_EVAL_NONE + CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); + if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ 
// Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt num_nodes, num_qpts; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -103,14 +270,12 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation - if (is_transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); - CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + // Clear v for transpose operation + if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); } // Apply basis operation @@ -156,6 +321,7 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra } } break; case CEED_EVAL_WEIGHT: { + CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); @@ -168,6 +334,19 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt 
num_elem, CeedTra CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ -181,10 +360,15 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallHip(ceed, hipModuleUnload(data->module)); - CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); + if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); CeedCallHip(ceed, hipFree(data->d_interp_1d)); CeedCallHip(ceed, hipFree(data->d_grad_1d)); + CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -198,12 +382,13 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallHip(ceed, 
hipModuleUnload(data->module)); - CeedCallHip(ceed, hipFree(data->d_q_weight)); + if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight)); CeedCallHip(ceed, hipFree(data->d_interp)); CeedCallHip(ceed, hipFree(data->d_grad)); CeedCallHip(ceed, hipFree(data->d_div)); CeedCallHip(ceed, hipFree(data->d_curl)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -213,8 +398,6 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp; const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); const CeedInt interp_bytes = q_bytes * P_1d; @@ -224,33 +407,35 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C CeedCallBackend(CeedCalloc(1, &data)); // Copy data to GPU - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); + if (q_weight_1d) { + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); + } CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); // Compile basis kernels + const char basis_kernel_source[] = "// Tensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, 
"ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, + Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -260,8 +445,6 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar 
*interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp, q_comp_interp, q_comp_grad; const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedBasisNonTensor_Hip *data; @@ -272,8 +455,10 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); + if (q_weight) { + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); + } if (interp) { const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; @@ -288,11 +473,9 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, } // Compile basis kernels + const char basis_kernel_source[] = "// Nontensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); @@ -300,14 +483,14 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -317,8 +500,6 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp, q_comp_interp, q_comp_div; const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedBasisNonTensor_Hip *data; @@ -329,8 +510,10 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); + if (q_weight) { + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); + } if (interp) { const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; @@ -345,11 +528,9 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node } // Compile basis kernels + const char basis_kernel_source[] = "// Nontensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); @@ -357,14 +538,14 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -374,8 +555,6 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp, q_comp_interp, q_comp_curl; const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedBasisNonTensor_Hip *data; @@ -386,8 +565,10 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); + if (q_weight) { + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); + } if (interp) { const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; @@ -402,11 +583,9 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod } // Compile basis kernels + const char basis_kernel_source[] = "// Nontensor basis source\n#include \n"; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); @@ -414,14 +593,14 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 486d9bc400..4ef3c76bdb 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -26,20 +26,28 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data - for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); - } - CeedCallBackend(CeedFree(&impl->e_vecs)); + CeedCallBackend(CeedFree(&impl->num_points)); + CeedCallBackend(CeedFree(&impl->skip_rstr_in)); + CeedCallBackend(CeedFree(&impl->skip_rstr_out)); + CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); + CeedCallBackend(CeedFree(&impl->input_field_order)); + CeedCallBackend(CeedFree(&impl->output_field_order)); + CeedCallBackend(CeedFree(&impl->input_states)); for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i])); CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } + CeedCallBackend(CeedFree(&impl->e_vecs_in)); CeedCallBackend(CeedFree(&impl->q_vecs_in)); for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i])); CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } + CeedCallBackend(CeedFree(&impl->e_vecs_out)); CeedCallBackend(CeedFree(&impl->q_vecs_out)); + CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem)); // QFunction assembly data for (CeedInt i = 0; i < impl->num_active_in; i++) { @@ -69,10 +77,11 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { CeedCallHip(ceed, hipFree(impl->diag->d_div_out)); CeedCallHip(ceed, hipFree(impl->diag->d_curl_in)); CeedCallHip(ceed, hipFree(impl->diag->d_curl_out)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); + 
CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); } CeedCallBackend(CeedFree(&impl->diag)); @@ -83,6 +92,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { CeedCallHip(ceed, hipModuleUnload(impl->asmb->module)); CeedCallHip(ceed, hipFree(impl->asmb->d_B_in)); CeedCallHip(ceed, hipFree(impl->asmb->d_B_out)); + CeedCallBackend(CeedDestroy(&ceed)); } CeedCallBackend(CeedFree(&impl->asmb)); @@ -93,8 +103,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, - CeedInt num_fields, CeedInt Q, CeedInt num_elem) { +static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis, + CeedVector *e_vecs, CeedVector *q_vecs, CeedInt num_fields, CeedInt Q, CeedInt num_elem) { Ceed ceed; CeedQFunctionField *qf_fields; CeedOperatorField *op_fields; @@ -110,68 +120,115 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i // Loop over fields for (CeedInt i = 0; i < num_fields; i++) { - bool is_strided = false, skip_restriction = false; - CeedSize q_size; - CeedInt size; - CeedEvalMode eval_mode; - CeedBasis basis; + bool is_active = false, is_strided = false, skip_e_vec = false; + CeedSize q_size; + CeedInt size; + CeedEvalMode eval_mode; + CeedVector l_vec; + CeedElemRestriction elem_rstr; + // Check whether this field can skip the element restriction: + // Input CEED_VECTOR_ACTIVE + // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE + // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT + // Input passive vector with 
CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&l_vec)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); - if (eval_mode != CEED_EVAL_WEIGHT) { - CeedElemRestriction elem_rstr; - - // Check whether this field can skip the element restriction: - // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); - - // First, check whether the field is input or output: - if (is_input) { - CeedVector vec; - - // Check for passive input - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec != CEED_VECTOR_ACTIVE) { - // Check eval_mode - if (eval_mode == CEED_EVAL_NONE) { - // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (is_strided) { - // Check if vector is already in preferred backend ordering - CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); - } - } - } - } - if (skip_restriction) { - // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 
- e_vecs[i + start_e] = NULL; - } else { - CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); - } + skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT); + if (!skip_e_vec && is_input && !is_active && eval_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec)); } + if (skip_e_vec) { + e_vecs[i] = NULL; + } else { + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i])); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); switch (eval_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)num_elem * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - break; case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)num_elem * Q * size; + q_size = (CeedSize)num_elem * (CeedSize)Q * (CeedSize)size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; - case CEED_EVAL_WEIGHT: // Only on input fields + case CEED_EVAL_WEIGHT: { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); - q_size = (CeedSize)num_elem * Q; + q_size = (CeedSize)num_elem * (CeedSize)Q; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + if (is_at_points) { + CeedInt num_points[num_elem]; + + for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q; + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, + q_vecs[i])); + } else { + CeedCallBackend(CeedBasisApply(basis, 
num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } } } + // Drop duplicate restrictions + if (is_input) { + for (CeedInt i = 0; i < num_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + skip_rstr[j] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } else { + for (CeedInt i = num_fields - 1; i >= 0; i--) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i - 1; j >= 0; j--) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + skip_rstr[j] = true; + apply_add_basis[i] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); return 
CEED_ERROR_SUCCESS; } @@ -179,7 +236,6 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ static int CeedOperatorSetup_Hip(CeedOperator op) { - Ceed ceed; bool is_setup_done; CeedInt Q, num_elem, num_input_fields, num_output_fields; CeedQFunctionField *qf_input_fields, *qf_output_fields; @@ -190,7 +246,6 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); @@ -199,159 +254,623 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in)); + 
CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out)); impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; - // Set up infield and outfield e_vecs and q_vecs - // Infields - CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); - // Outfields - CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); + // Set up infield and outfield e-vecs and q-vecs + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q, + num_elem)); + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out, + impl->q_vecs_out, num_output_fields, Q, num_elem)); + // Reorder fields to allow reuse of buffers + impl->max_active_e_vec_len = 0; + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) { + // CEED_EVAL_WEIGHT + CeedCallBackend(CeedVectorDestroy(&vec_i)); + continue; + }; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? 
e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->output_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? 
e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_output_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->output_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len)); + { + // Create two work vectors for diagonal assembly + CeedVector temp_1, temp_2; + + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1)); + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2)); + } CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Setup Operator Inputs +// Restrict Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], - CeedOperator_Hip *impl, CeedRequest *request) { - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; +static inline int 
CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Hip *impl, + CeedRequest *request) { + bool is_active = false; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field]; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active && skip_active) return CEED_ERROR_SUCCESS; + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { - if (skip_active) continue; - else vec = in_vec; - } + // Restriction action + if (e_vec) { + // Restrict, if necessary + if (!impl->skip_rstr_in[input_field]) { + uint64_t state; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; - // Restrict, if necessary - if (!impl->e_vecs[i]) { - // No restriction for this field; read data directly from vec. 
- CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); - } else { - CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); - // Get evec - CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedVectorGetState(l_vec, &state)); + if (is_active || state != impl->input_states[input_field]) { + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } + impl->input_states[input_field] = state; } } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], +static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const bool skip_active, CeedOperator_Hip *impl) { + bool is_active = false; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field]; + + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active && skip_active) return CEED_ERROR_SUCCESS; + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } + + // Basis action + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: { + const CeedScalar *e_vec_array; + + if (e_vec) { + CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array)); + } else { + CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array)); + } + CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array)); + break; + } + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec)); + CeedCallBackend(CeedBasisDestroy(&basis)); + break; + } + case CEED_EVAL_WEIGHT: + break; // No action + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Restore Input Vectors +//------------------------------------------------------------------------------ +static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Hip *impl) { + bool is_active = false; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field]; + + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active && skip_active) return CEED_ERROR_SUCCESS; + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } + + // Restore e-vec + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + const CeedScalar *e_vec_array; + + 
CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, (CeedScalar **)&e_vec_array)); + if (e_vec) { + CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, &e_vec_array)); + } else { + CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array)); + } + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Apply and add to output +//------------------------------------------------------------------------------ +static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt Q, num_elem, num_input_fields, num_output_fields; + Ceed ceed; + CeedVector active_e_vec; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Setup + CeedCallBackend(CeedOperatorSetup_Hip(op)); + + // Work vector + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec)); + + // Process inputs for (CeedInt i = 0; i < num_input_fields; i++) { - CeedInt elem_size, size; - CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; + CeedInt field = impl->input_field_order[i]; - // Skip active input - if (skip_active) { - CeedVector vec; + CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], 
field, in_vec, active_e_vec, false, impl, request)); + CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl)); + } + + // Output pointers, as necessary + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); } - // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + } + + // Q function + CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl)); + } + + // Output basis and restriction + for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active = false; + CeedInt field = impl->output_field_order[i]; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field]; + + // Output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active) { + l_vec = out_vec; + if (!e_vec) e_vec = active_e_vec; + } + // Basis action + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode)); switch (eval_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); - break; + break; // No action case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis)); + if (impl->apply_add_basis_out[field]) { + CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec)); + } else { + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; - case CEED_EVAL_WEIGHT: - break; // No action + } + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + // LCOV_EXCL_STOP + } + } + + // Restore evec + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array)); + } + + // Restrict + if (!impl->skip_rstr_out[field]) { + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); } + + // Return work vector + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec)); + CeedCallBackend(CeedDestroy(&ceed)); + 
CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Restore Input Vectors +// CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; +static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) { + bool is_setup_done; + CeedInt max_num_points = -1, num_elem, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; - // Skip active input - if (skip_active) { - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + { + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedCalloc(num_elem, &impl->num_points)); + for 
(CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + impl->num_points[e] = num_points_elem; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - if (!impl->e_vecs[i]) { // This was a skip_restriction case - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + impl->max_num_points = max_num_points; + + // Allocate + CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states)); + CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out)); + impl->num_inputs = num_input_fields; + impl->num_outputs = num_output_fields; + + // Set up infield and outfield e-vecs and q-vecs + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, + max_num_points, num_elem)); + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out, impl->q_vecs_out, + num_output_fields, max_num_points, num_elem)); + + // Reorder fields to allow reuse of buffers + impl->max_active_e_vec_len = 0; + { + bool 
is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->input_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i == CEED_VECTOR_NONE) { + // CEED_EVAL_WEIGHT + CeedCallBackend(CeedVectorDestroy(&vec_i)); + continue; + }; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_input_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->input_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + { + bool is_ordered[CEED_FIELD_MAX]; + CeedInt curr_index = 0; + + for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedSize e_vec_len_i; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + if (is_ordered[i]) continue; + is_ordered[i] = true; + impl->output_field_order[curr_index] = i; + curr_index++; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i)); 
+ CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i)); + impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len; + for (CeedInt j = i + 1; j < num_output_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j && vec_i == vec_j) { + is_ordered[j] = true; + impl->output_field_order[curr_index] = j; + curr_index++; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len)); + { + // Create two work vectors for diagonal assembly + CeedVector temp_1, temp_2; + + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1)); + CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1)); + CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2)); + } + CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Input Basis Action AtPoints +//------------------------------------------------------------------------------ +static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field, + CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points, + const bool skip_active, const bool skip_passive, CeedOperator_Hip *impl) { + 
bool is_active = false; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field]; + + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (skip_active && is_active) return CEED_ERROR_SUCCESS; + if (skip_passive && !is_active) { + CeedCallBackend(CeedVectorDestroy(&l_vec)); + return CEED_ERROR_SUCCESS; + } + if (is_active) { + l_vec = in_vec; + if (!e_vec) e_vec = active_e_vec; + } + + // Basis action + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: { + const CeedScalar *e_vec_array; + + if (e_vec) { + CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array)); } else { - CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array)); } + CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array)); + break; + } + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis)); + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec)); + CeedCallBackend(CeedBasisDestroy(&basis)); + break; } + case CEED_EVAL_WEIGHT: + break; // No action } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ -// Apply and add to output +// Apply and add to output AtPoints //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { - CeedInt 
Q, num_elem, elem_size, num_input_fields, num_output_fields, size; - CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; +static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields; + Ceed ceed; + CeedVector active_e_vec; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Setup - CeedCallBackend(CeedOperatorSetup_Hip(op)); + CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op)); + num_points = impl->num_points; + max_num_points = impl->max_num_points; - // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); + // Work vector + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec)); - // Input basis apply if needed - CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); + // Get point coordinates + { + CeedVector point_coords = NULL; + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords)); + if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem)); + { + uint64_t state; + 
CeedCallBackend(CeedVectorGetState(point_coords, &state)); + if (impl->points_state != state) { + CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); + } + } + CeedCallBackend(CeedVectorDestroy(&point_coords)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + + // Process inputs + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt field = impl->input_field_order[i]; + + CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request)); + CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, + num_points, false, false, impl)); + } // Output pointers, as necessary for (CeedInt i = 0; i < num_output_fields; i++) { @@ -359,68 +878,86 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { - // Set the output Q-Vector to use the E-Vector data directly. 
- CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); } } // Q function - CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); + CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out)); - // Output basis apply if needed + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl)); + } + + // Output basis and restriction for (CeedInt i = 0; i < num_output_fields; i++) { - CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; + bool is_active = false; + CeedInt field = impl->output_field_order[i]; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field]; + + // Output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + if (is_active) { + l_vec = out_vec; + if (!e_vec) e_vec = active_e_vec; + } - // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); // Basis action + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode)); switch (eval_mode) { 
case CEED_EVAL_NONE: break; // No action case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis)); + if (impl->apply_add_basis_out[field]) { + CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec)); + } else { + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { - return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); // LCOV_EXCL_STOP } } - } - - // Output restriction - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; // Restore evec - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array)); } - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - // Active - if (vec == CEED_VECTOR_ACTIVE) vec = 
out_vec; + if (!impl->skip_rstr_out[field]) { + CeedElemRestriction elem_rstr; - CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec)); } - // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + // Restore work vector + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -431,7 +968,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b CeedRequest *request) { Ceed ceed, ceed_parent; CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; - CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedScalar *assembled_array; CeedVector *active_inputs; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -452,19 +989,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b // Setup CeedCallBackend(CeedOperatorSetup_Hip(op)); - // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); + // Process inputs + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request)); + CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, true, impl)); + } // Count number of active input fields 
if (!num_active_in) { for (CeedInt i = 0; i < num_input_fields; i++) { CeedScalar *q_vec_array; - CeedVector vec; + CeedVector l_vec; - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); @@ -473,12 +1012,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b CeedSize q_size = (CeedSize)Q * num_elem; CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); - CeedCallBackend( - CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); + CeedCallBackend(CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, + &q_vec_array[field * Q * num_elem])); } num_active_in += size; CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } impl->num_active_in = num_active_in; impl->qf_active_in = active_inputs; @@ -487,15 +1027,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b // Count number of active output fields if (!num_active_out) { for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; + CeedVector l_vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); num_active_out += 
size; } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } impl->num_active_out = num_active_out; } @@ -510,16 +1050,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b // Create output restriction CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, rstr)); + (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides, + rstr)); // Create assembled vector CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); - // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); - // Assemble QFunction for (CeedInt in = 0; in < num_active_in; in++) { // Set Inputs @@ -529,38 +1067,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b } // Set Outputs for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; + CeedVector l_vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } // Apply QFunction CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } - // Un-set output q_vecs to prevent accidental overwrite of Assembled + // Un-set output 
q-vecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; + CeedVector l_vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); - // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec)); + if (l_vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); } + CeedCallBackend(CeedVectorDestroy(&l_vec)); } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl)); + } // Restore output CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -604,13 +1146,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedBasis basis; CeedEvalMode eval_mode; + CeedBasis basis; CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases"); - basis_in = basis; + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { @@ -620,6 +1163,7 @@ static inline int 
CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { num_eval_modes_in += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis @@ -636,7 +1180,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases"); - basis_out = basis; + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { @@ -646,6 +1191,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { num_eval_modes_out += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Operator data struct @@ -757,6 +1303,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice)); CeedCallBackend(CeedFree(&eval_modes_in)); CeedCallBackend(CeedFree(&eval_modes_out)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -765,8 +1315,6 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { //------------------------------------------------------------------------------ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { Ceed ceed; - char *diagonal_kernel_source; - const char *diagonal_kernel_path; CeedInt num_input_fields, num_output_fields, 
num_eval_modes_in = 0, num_eval_modes_out = 0; CeedInt num_comp, q_comp, num_nodes, num_qpts; CeedBasis basis_in = NULL, basis_out = NULL; @@ -788,14 +1336,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedEvalMode eval_mode; + CeedBasis basis; - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { num_eval_modes_in += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis @@ -807,14 +1359,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedEvalMode eval_mode; + CeedBasis basis; - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); if (eval_mode != CEED_EVAL_WEIGHT) { num_eval_modes_out += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Operator data struct @@ -822,22 +1378,22 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedOperatorDiag_Hip *diag = impl->diag; // Assemble kernel - hipModule_t *module = is_point_block ? 
&diag->module_point_block : &diag->module; - CeedInt elems_per_block = 1; + const char diagonal_kernel_source[] = "// Diagonal assembly source\n#include <ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h>\n"; + hipModule_t *module = is_point_block ? &diag->module_point_block : &diag->module; + CeedInt elems_per_block = 1; + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE", use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? 
&diag->LinearPointBlock : &diag->LinearDiagonal)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -889,6 +1445,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); } + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); @@ -897,8 +1455,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes)); if (num_nodes > 0) { // Assemble element operator diagonals - CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); + CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); // Compute the diagonal of B^T D B CeedInt elems_per_block = 1; @@ -922,6 +1480,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -945,10 +1504,9 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, 
//------------------------------------------------------------------------------ // Single Operator Assembly Setup //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { +static int CeedOperatorAssembleSingleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { Ceed ceed; - char *assembly_kernel_source; - const char *assembly_kernel_path; + Ceed_Hip *hip_data; CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; CeedInt elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; @@ -973,13 +1531,17 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedBasis basis; - CeedEvalMode eval_mode; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); - basis_in = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr)); + if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); @@ -994,6 +1556,7 @@ static int 
CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed num_eval_modes_in += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis; basis_out and rstr_out only used if same as input, TODO @@ -1003,14 +1566,18 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedBasis basis; - CeedEvalMode eval_mode; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); - basis_out = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCallBackend(CeedBasisDestroy(&basis)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr)); + if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); @@ -1027,6 +1594,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed num_eval_modes_out += q_comp; } } + CeedCallBackend(CeedVectorDestroy(&vec)); } CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); @@ -1036,7 +1604,8 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed asmb->block_size_x = elem_size_in; asmb->block_size_y = elem_size_out; - bool fallback = 
asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024; + CeedCallBackend(CeedGetData(ceed, &hip_data)); + bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device_prop.maxThreadsPerBlock; if (fallback) { // Use fallback kernel with 1D threadblock @@ -1044,20 +1613,16 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed } // Compile kernels + const char assembly_kernel_source[] = "// Full assembly source\n#include <ceed/jit-source/hip/hip-ref-operator-assemble.h>\n"; + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! 
-----\n"); CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); - CeedCallBackend(CeedFree(&assembly_kernel_path)); - CeedCallBackend(CeedFree(&assembly_kernel_source)); // Load into B_in, in order that they will be used in eval_modes_in { @@ -1090,11 +1655,9 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), hipMemcpyHostToDevice)); } - - if (identity) { - CeedCallBackend(CeedFree(&identity)); - } + CeedCallBackend(CeedFree(&identity)); } + CeedCallBackend(CeedFree(&eval_modes_in)); // Load into B_out, in order that they will be used in eval_modes_out { @@ -1127,11 +1690,15 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), hipMemcpyHostToDevice)); } - - if (identity) { - CeedCallBackend(CeedFree(&identity)); - } + CeedCallBackend(CeedFree(&identity)); } + CeedCallBackend(CeedFree(&eval_modes_out)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -1139,11 +1706,11 
@@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed // Assemble matrix data for COO matrix of assembled operator. // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. // -// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval -// modes). +// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator +// (could have multiple basis eval modes). // TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) { +static int CeedOperatorAssembleSingle_Hip(CeedOperator op, CeedInt offset, CeedVector values) { Ceed ceed; CeedSize values_length = 0, assembled_qf_length = 0; CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; @@ -1169,7 +1736,7 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; // Setup - if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx)); + if (!impl->asmb) CeedCallBackend(CeedOperatorAssembleSingleSetup_Hip(op, use_ceedsize_idx)); CeedOperatorAssemble_Hip *asmb = impl->asmb; assert(asmb != NULL); @@ -1215,8 +1782,8 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, 
asmb->LinearAssemble, NULL, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, + shared_mem, args)); // Restore arrays CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); @@ -1236,6 +1803,272 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); } } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Assemble Linear QFunction AtPoints +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction"); +} + +//------------------------------------------------------------------------------ +// Assemble Linear Diagonal AtPoints +//------------------------------------------------------------------------------ +static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields; + Ceed ceed; + CeedVector active_e_vec_in, active_e_vec_out; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, 
&op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Setup + CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op)); + num_points = impl->num_points; + max_num_points = impl->max_num_points; + + // Work vector + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in)); + CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out)); + { + CeedSize length_in, length_out; + + CeedCallBackend(CeedVectorGetLength(active_e_vec_in, &length_in)); + CeedCallBackend(CeedVectorGetLength(active_e_vec_out, &length_out)); + // Need input e_vec to be longer + if (length_in < length_out) { + CeedVector temp = active_e_vec_in; + + active_e_vec_in = active_e_vec_out; + active_e_vec_out = temp; + } + } + + // Get point coordinates + { + CeedVector point_coords = NULL; + CeedElemRestriction rstr_points = NULL; + + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords)); + if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem)); + { + uint64_t state; + CeedCallBackend(CeedVectorGetState(point_coords, &state)); + if (impl->points_state != state) { + CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); + } + } + CeedCallBackend(CeedVectorDestroy(&point_coords)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + } + + // Process inputs + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request)); + CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false, + impl)); + } + + // Output pointers, as necessary + for (CeedInt i = 0; i < num_output_fields; i++) { + 
CeedEvalMode eval_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); + } + } + + // Loop over active fields + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active = false, is_active_at_points = true; + CeedInt elem_size = 1, num_comp_active = 1, e_vec_size = 0, field_in = impl->input_field_order[i]; + CeedRestrictionType rstr_type; + CeedVector l_vec; + CeedElemRestriction elem_rstr; + + // -- Skip non-active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[field_in], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&l_vec)); + if (!is_active || impl->skip_rstr_in[field_in]) continue; + + // -- Get active restriction type + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[field_in], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS; + if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + else elem_size = max_num_points; + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + + e_vec_size = elem_size * num_comp_active; + CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0)); + for (CeedInt s = 0; s < e_vec_size; s++) { + CeedVector q_vec = impl->q_vecs_in[field_in]; + + // Update unit vector + { + // Note: E-vec strides are node * (1) + comp * (elem_size * num_elem) + elem * (elem_size) + CeedInt node = (s - 1) % elem_size, comp = (s - 1) / elem_size; + CeedSize start = node * 1 + comp * (elem_size * num_elem); + CeedSize stop = (comp + 1) 
* (elem_size * num_elem); + + if (s != 0) CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0)); + + node = s % elem_size, comp = s / elem_size; + start = node * 1 + comp * (elem_size * num_elem); + stop = (comp + 1) * (elem_size * num_elem); + CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0)); + } + + // Basis action + for (CeedInt j = 0; j < num_input_fields; j++) { + CeedInt field = impl->input_field_order[j]; + + CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, NULL, active_e_vec_in, num_elem, + num_points, false, true, impl)); + } + + // Q function + CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out)); + + // Output basis apply if needed + for (CeedInt j = 0; j < num_output_fields; j++) { + bool is_active = false; + CeedInt elem_size = 0; + CeedInt field_out = impl->output_field_order[j]; + CeedRestrictionType rstr_type; + CeedEvalMode eval_mode; + CeedVector l_vec, e_vec = impl->e_vecs_out[field_out], q_vec = impl->q_vecs_out[field_out]; + CeedElemRestriction elem_rstr; + + // ---- Skip non-active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field_out], &l_vec)); + is_active = l_vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&l_vec)); + if (!is_active) continue; + if (!e_vec) e_vec = active_e_vec_out; + + // ---- Check if elem size matches + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field_out], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue; + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size)); + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + } + { + CeedInt num_comp = 0; + + 
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + if (e_vec_size != num_comp * elem_size) continue; + } + + // Basis action + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field_out], &eval_mode)); + switch (eval_mode) { + case CEED_EVAL_NONE: { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(q_vec, CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array)); + break; + } + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis)); + if (impl->apply_add_basis_out[field_out]) { + CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, + e_vec)); + } else { + CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + break; + } + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + // LCOV_EXCL_STOP + } + } + + // Continue if a field that is summed into + if (impl->skip_rstr_out[field_out]) { + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + continue; + } + + // Mask output e-vec + CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec)); + + // Restrict + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + + // Reset q_vec for + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorGetArrayWrite(e_vec, CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array)); + } + } + + // Reset vec 
+ if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(q_vec, 0.0)); + } + } + + // Restore CEED_EVAL_NONE + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + + // Get eval_mode + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + + // Restore evec + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) { + CeedScalar *e_vec_array; + + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array)); + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_vec_array)); + } + } + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl)); + } + + // Restore work vector + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in)); + CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -1249,14 +2082,35 @@ int CeedOperatorCreate_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip)); - CeedCallBackend( - CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, 
"LinearAssembleSingle", CeedSingleOperatorAssemble_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", + CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create operator AtPoints +//------------------------------------------------------------------------------ +int CeedOperatorCreateAtPoints_Hip(CeedOperator op) { + Ceed ceed; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp index 222a94fc85..bf938eacc4 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp +++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -25,40 +25,38 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { using std::string; Ceed ceed; - char *read_write_kernel_source; - const char *read_write_kernel_path; Ceed_Hip *ceed_Hip; CeedInt num_input_fields, num_output_fields, size; CeedQFunctionField *input_fields, *output_fields; CeedQFunction_Hip *data; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); - // QFunction is built + CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); if (data->QFunction) return CEED_ERROR_SUCCESS; - CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided."); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); // QFunction kernel generation CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Build strings for final kernel - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-qfunction.h", &read_write_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! 
-----\n"); - string qfunction_source(data->qfunction_source); string qfunction_name(data->qfunction_name); - string read_write(read_write_kernel_source); string kernel_name = "CeedKernelHipRefQFunction_" + qfunction_name; ostringstream code; - // Defintions - code << read_write; - code << qfunction_source; - code << "\n"; + // Definitions + code << "// QFunction source\n"; + code << "#include \n\n"; + { + const char *source_path; + + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); + CeedCheck(source_path, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided."); + + code << "// User QFunction source\n"; + code << "#include \"" << source_path << "\"\n\n"; + } code << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; code << "__global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Hip fields) {\n"; @@ -69,7 +67,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { code << " const CeedInt size_input_" << i << " = " << size << ";\n"; code << " CeedScalar input_" << i << "[size_input_" << i << "];\n"; } - code << " const CeedScalar* inputs[" << num_input_fields << "];\n"; + code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_input_fields; i++) { code << " inputs[" << i << "] = input_" << i << ";\n"; } @@ -82,7 +80,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { code << " const CeedInt size_output_" << i << " = " << size << ";\n"; code << " CeedScalar output_" << i << "[size_output_" << i << "];\n"; } - code << " CeedScalar* outputs[" << num_output_fields << "];\n"; + code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; for (CeedInt i = 0; i < num_output_fields; i++) { code << " outputs[" << i << "] = output_" << i << ";\n"; } @@ -111,18 +109,10 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { code << " }\n"; code << "}\n"; - // View kernel for debugging - CeedDebug256(ceed, 
CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n"); - CeedDebug(ceed, code.str().c_str()); - // Compile kernel CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 1, "BLOCK_SIZE", ceed_Hip->opt_block_size)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, kernel_name.c_str(), &data->QFunction)); - - // Cleanup - CeedCallBackend(CeedFree(&data->qfunction_source)); - CeedCallBackend(CeedFree(&read_write_kernel_path)); - CeedCallBackend(CeedFree(&read_write_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.h b/backends/hip-ref/ceed-hip-ref-qfunction-load.h index dc83256d83..5fc7073046 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction-load.h +++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c index e5e72cfd43..60dd757ee7 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction.c +++ b/backends/hip-ref/ceed-hip-ref-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -60,6 +60,7 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce // Restore context CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -88,15 +89,12 @@ int CeedQFunctionCreate_Hip(CeedQFunction qf) { CeedCallBackend(CeedQFunctionSetData(qf, data)); CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); - // Read QFunction source CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); - CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c index 52bf13370b..a223fa91d8 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c +++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -37,6 +37,7 @@ static inline int CeedQFunctionContextSyncH2D_Hip(const CeedQFunctionContext ctx impl->d_data = impl->d_data_owned; } CeedCallHip(ceed, hipMemcpy(impl->d_data, impl->h_data, ctx_size, hipMemcpyHostToDevice)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -63,6 +64,7 @@ static inline int CeedQFunctionContextSyncD2H_Hip(const CeedQFunctionContext ctx impl->h_data = impl->h_data_owned; } CeedCallHip(ceed, hipMemcpy(impl->h_data, impl->d_data, ctx_size, hipMemcpyDeviceToHost)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -76,7 +78,9 @@ static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, C case CEED_MEM_DEVICE: return CeedQFunctionContextSyncH2D_Hip(ctx); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -204,6 +208,7 @@ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, impl->d_data = data; break; } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -219,7 +224,9 @@ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const case CEED_MEM_DEVICE: return CeedQFunctionContextSetDataDevice_Hip(ctx, copy_mode, data); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -334,6 +341,7 @@ int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); 
CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c index 625430f9d7..b1cd8b5c06 100644 --- a/backends/hip-ref/ceed-hip-ref-restriction.c +++ b/backends/hip-ref/ceed-hip-ref-restriction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -23,36 +23,34 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) { Ceed ceed; bool is_deterministic; - char *restriction_kernel_source; - const char *restriction_kernel_path; CeedInt num_elem, num_comp, elem_size, comp_stride; CeedRestrictionType rstr_type; CeedElemRestriction_Hip *impl; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size)); + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + } is_deterministic = impl->d_l_vec_indices != NULL; // Compile HIP kernels switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - bool has_backend_strides; - CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; + const 
char restriction_kernel_source[] = "// Strided restriction source\n#include \n"; + bool has_backend_strides; + CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); } - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-strided.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); @@ -60,27 +58,30 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); } break; case CEED_RESTRICTION_STANDARD: { - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); + const char restriction_kernel_source[] = "// Standard restriction source\n#include \n"; + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); } break; + case CEED_RESTRICTION_POINTS: { + const char restriction_kernel_source[] = + "// AtPoints restriction source\n#include \n\n" + "// Standard restriction source\n#include \n"; + + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose)); + } break; case CEED_RESTRICTION_ORIENTED: { - const char *offset_kernel_path; - char **file_paths = NULL; - CeedInt num_file_paths = 0; - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-oriented.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path)); - CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + const char restriction_kernel_source[] = + "// Oriented restriction source\n#include \n\n" + "// Standard restriction source\n#include \n"; + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); @@ -88,22 +89,12 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); - // Cleanup - CeedCallBackend(CeedFree(&offset_kernel_path)); - for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); - CeedCall(CeedFree(&file_paths)); } break; case CEED_RESTRICTION_CURL_ORIENTED: { - const char *offset_kernel_path; - char **file_paths = NULL; - CeedInt num_file_paths = 0; - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path)); - CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + const char restriction_kernel_source[] = + "// Curl oriented restriction source\n#include \n\n" + "// Standard restriction source\n#include \n"; + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); @@ -113,19 +104,10 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); - // Cleanup - CeedCallBackend(CeedFree(&offset_kernel_path)); - for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); - CeedCall(CeedFree(&file_paths)); - } break; - case CEED_RESTRICTION_POINTS: { - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); - // LCOV_EXCL_STOP + } break; } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -174,6 +156,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; + case CEED_RESTRICTION_POINTS: case CEED_RESTRICTION_STANDARD: { void *args[] = {&impl->d_offsets, &d_u, &d_v}; @@ -205,11 +188,6 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); } } break; - case CEED_RESTRICTION_POINTS: { - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); - // LCOV_EXCL_STOP - } break; } } else { // E-vector -> L-vector @@ -223,6 +201,17 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } break; + case 
CEED_RESTRICTION_POINTS: { + if (!is_deterministic) { + void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v}; + + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); + } else { + void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v}; + + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); + } + } break; case CEED_RESTRICTION_STANDARD: { if (!is_deterministic) { void *args[] = {&impl->d_offsets, &d_u, &d_v}; @@ -290,11 +279,6 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce } } } break; - case CEED_RESTRICTION_POINTS: { - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); - // LCOV_EXCL_STOP - } break; } } @@ -303,6 +287,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce // Restore arrays CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -334,14 +319,16 @@ static int CeedElemRestrictionApplyUnoriented_Hip(CeedElemRestriction rstr, Ceed //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { CeedElemRestriction_Hip *impl; + CeedRestrictionType rstr_type; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); switch (mem_type) { case CEED_MEM_HOST: - *offsets = impl->h_offsets; + *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets; break; case CEED_MEM_DEVICE: - *offsets = impl->d_offsets; + *offsets = rstr_type == CEED_RESTRICTION_POINTS ? 
impl->d_offsets_at_points : impl->d_offsets; break; } return CEED_ERROR_SUCCESS; @@ -383,6 +370,17 @@ static int CeedElemRestrictionGetCurlOrientations_Hip(CeedElemRestriction rstr, return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Get offset for padded AtPoints E-layout +//------------------------------------------------------------------------------ +static int CeedElemRestrictionGetAtPointsElementOffset_Hip(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) { + CeedInt layout[3]; + + CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout)); + *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2]; + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy restriction //------------------------------------------------------------------------------ @@ -404,25 +402,31 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) { CeedCallHip(ceed, hipFree((bool *)impl->d_orients_owned)); CeedCallBackend(CeedFree(&impl->h_curl_orients_owned)); CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_curl_orients_owned)); + CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned)); + CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_offsets_at_points_owned)); + CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned)); + CeedCallHip(ceed, hipFree((CeedInt *)impl->d_points_per_elem_owned)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Create transpose offsets and indices //------------------------------------------------------------------------------ -static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt *indices) { +static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) { Ceed 
ceed; bool *is_node; CeedSize l_size; - CeedInt num_elem, elem_size, num_comp, num_nodes = 0; + CeedInt num_elem, num_comp, num_nodes = 0; CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; + CeedRestrictionType rstr_type; CeedElemRestriction_Hip *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); const CeedInt size_indices = num_elem * elem_size; @@ -485,6 +489,7 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const C CeedCallBackend(CeedFree(&l_vec_indices)); CeedCallBackend(CeedFree(&t_offsets)); CeedCallBackend(CeedFree(&t_indices)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -495,16 +500,27 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt8 *curl_orients, CeedElemRestriction rstr) { Ceed ceed, ceed_parent; bool is_deterministic; - CeedInt num_elem, elem_size; + CeedInt num_elem, num_comp, elem_size; CeedRestrictionType rstr_type; CeedElemRestriction_Hip *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); + CeedCallBackend(CeedDestroy(&ceed_parent)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + // Use max number of points as elem size for AtPoints restrictions + if (rstr_type == 
CEED_RESTRICTION_POINTS) { + CeedInt max_points = 0; + + for (CeedInt i = 0; i < num_elem; i++) { + max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]); + } + elem_size = max_points; + } const CeedInt size = num_elem * elem_size; CeedCallBackend(CeedCalloc(1, &impl)); @@ -525,6 +541,51 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, } } + // Pad AtPoints indices + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1; + CeedInt max_points = elem_size, *offsets_padded, *points_per_elem; + + CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction"); + CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded)); + CeedCallBackend(CeedMalloc(num_elem, &points_per_elem)); + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points = offsets[i + 1] - offsets[i]; + CeedInt last_point = 0; + + points_per_elem[i] = num_points; + at_points_size += num_points; + // -- Copy all points in element + for (CeedInt j = 0; j < num_points; j++) { + offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp; + last_point = offsets_padded[i * max_points + j]; + } + // -- Replicate out last point in element + for (CeedInt j = num_points; j < max_points; j++) { + offsets_padded[i * max_points + j] = last_point; + } + } + CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed, + &impl->h_offsets_at_points)); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt))); + CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt), + hipMemcpyHostToDevice)); + impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned; + + // -- Use padded offsets for the rest of the setup 
+ offsets = (const CeedInt *)offsets_padded; + copy_mode = CEED_OWN_POINTER; + CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp)); + + // -- Points per element + CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned, + &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt))); + CeedCallHip(ceed, + hipMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), hipMemcpyHostToDevice)); + impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned; + } + // Set up device offset/orientation arrays if (rstr_type != CEED_RESTRICTION_STRIDED) { switch (mem_type) { @@ -533,7 +594,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallHip(ceed, hipMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt))); CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), hipMemcpyHostToDevice)); impl->d_offsets = (CeedInt *)impl->d_offsets_owned; - if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, offsets)); + if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, elem_size, offsets)); } break; case CEED_MEM_DEVICE: { CeedCallBackend(CeedSetDeviceCeedIntArray_Hip(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed, @@ -541,7 +602,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned)); CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), hipMemcpyDeviceToHost)); impl->h_offsets = impl->h_offsets_owned; - if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, offsets)); + if (is_deterministic) 
CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, elem_size, offsets)); } break; } @@ -591,7 +652,12 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Hip)); + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", + CeedElemRestrictionGetAtPointsElementOffset_Hip)); + } CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c index 5789679578..f1d1dcd93a 100644 --- a/backends/hip-ref/ceed-hip-ref-vector.c +++ b/backends/hip-ref/ceed-hip-ref-vector.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -39,15 +39,13 @@ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, CeedMemType mem_t // Sync host to device //------------------------------------------------------------------------------ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { - Ceed ceed; CeedSize length; size_t bytes; CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); + CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device"); CeedCallBackend(CeedVectorGetLength(vec, &length)); bytes = length * sizeof(CeedScalar); @@ -56,10 +54,10 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { } else if (impl->d_array_owned) { impl->d_array = impl->d_array_owned; } else { - CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes)); + CeedCallHip(CeedVectorReturnCeed(vec), hipMalloc((void **)&impl->d_array_owned, bytes)); impl->d_array = impl->d_array_owned; } - CeedCallHip(ceed, hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice)); + CeedCallHip(CeedVectorReturnCeed(vec), hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -67,15 +65,13 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { // Sync device to host //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { - Ceed ceed; CeedSize length; size_t bytes; CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host"); 
if (impl->h_array_borrowed) { impl->h_array = impl->h_array_borrowed; @@ -91,7 +87,7 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { CeedCallBackend(CeedVectorGetLength(vec, &length)); bytes = length * sizeof(CeedScalar); - CeedCallHip(ceed, hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost)); + CeedCallHip(CeedVectorReturnCeed(vec), hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -99,7 +95,15 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { // Sync arrays //------------------------------------------------------------------------------ static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) { - bool need_sync = false; + bool need_sync = false; + CeedVector_Hip *impl; + + // Sync for unified memory + CeedCallBackend(CeedVectorGetData(vec, &impl)); + if (impl->has_unified_addressing && !impl->h_array_borrowed) { + CeedCallHip(CeedVectorReturnCeed(vec), hipDeviceSynchronize()); + return CEED_ERROR_SUCCESS; + } // Check whether device/host sync is needed CeedCallBackend(CeedVectorNeedSync_Hip(vec, mem_type, &need_sync)); @@ -111,7 +115,9 @@ static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) { case CEED_MEM_DEVICE: return CeedVectorSyncH2D_Hip(vec); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -162,6 +168,10 @@ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, Cee CeedVector_Hip *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); + + // Use device memory for unified memory + mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? 
CEED_MEM_DEVICE : mem_type; + switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_array_of_type = impl->h_array_borrowed; @@ -202,6 +212,44 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode CeedCallBackend(CeedSetDeviceCeedScalarArray_Hip(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned, (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Set array with unified memory +//------------------------------------------------------------------------------ +static int CeedVectorSetArrayUnifiedHostToDevice_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { + CeedSize length; + Ceed ceed; + CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + + switch (copy_mode) { + case CEED_COPY_VALUES: + case CEED_OWN_POINTER: + if (!impl->d_array) { + if (impl->d_array_borrowed) { + impl->d_array = impl->d_array_borrowed; + } else { + if (!impl->d_array_owned) CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, sizeof(CeedScalar) * length)); + impl->d_array = impl->d_array_owned; + } + } + if (array) CeedCallHip(ceed, hipMemcpy(impl->d_array, array, sizeof(CeedScalar) * length, hipMemcpyHostToDevice)); + if (copy_mode == CEED_OWN_POINTER) CeedCallBackend(CeedFree(&array)); + break; + case CEED_USE_POINTER: + CeedCallHip(ceed, hipFree(impl->d_array_owned)); + CeedCallBackend(CeedFree(&impl->h_array_owned)); + impl->h_array_owned = NULL; + impl->h_array_borrowed = array; + impl->d_array = impl->h_array_borrowed; + } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -216,11 +264,83 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty 
CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec)); switch (mem_type) { case CEED_MEM_HOST: - return CeedVectorSetArrayHost_Hip(vec, copy_mode, array); + if (impl->has_unified_addressing) { + return CeedVectorSetArrayUnifiedHostToDevice_Hip(vec, copy_mode, array); + } else { + return CeedVectorSetArrayHost_Hip(vec, copy_mode, array); + } case CEED_MEM_DEVICE: return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP +} + +//------------------------------------------------------------------------------ +// Copy host array to value strided +//------------------------------------------------------------------------------ +static int CeedHostCopyStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *h_copy_array) { + for (CeedSize i = start; i < stop; i += step) h_copy_array[i] = h_array[i]; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Copy device array to value strided (impl in .hip.cpp file) +//------------------------------------------------------------------------------ +int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array); + +//------------------------------------------------------------------------------ +// Copy a vector to a value strided +//------------------------------------------------------------------------------ +static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) { + CeedSize length; + CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + { + CeedSize length_vec, length_copy; + + CeedCallBackend(CeedVectorGetLength(vec, &length_vec)); + CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy)); + length = length_vec < length_copy ? 
length_vec : length_copy; + } + if (stop == -1) stop = length; + // Set value for synced device/host array + if (impl->d_array) { + CeedScalar *copy_array; + Ceed ceed; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array)); +#if (HIP_VERSION >= 60000000) + hipblasHandle_t handle; + hipStream_t stream; + CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle)); + CeedCallHipblas(ceed, hipblasGetStream(handle, &stream)); +#if defined(CEED_SCALAR_IS_FP32) + CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step)); +#else /* CEED_SCALAR */ + CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step)); +#endif /* CEED_SCALAR */ + CeedCallHip(ceed, hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ + CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, stop, step, copy_array)); +#endif /* HIP_VERSION */ + CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array)); + impl->h_array = NULL; + CeedCallBackend(CeedDestroy(&ceed)); + } else if (impl->h_array) { + CeedScalar *copy_array; + + CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array)); + CeedCallBackend(CeedHostCopyStrided_Hip(impl->h_array, start, stop, step, copy_array)); + CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array)); + impl->d_array = NULL; + } else { + return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set"); + } + return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ @@ -242,8 +362,10 @@ int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { CeedSize length; CeedVector_Hip *impl; + Ceed_Hip *hip_data; 
CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedGetData(CeedVectorReturnCeed(vec), &hip_data)); CeedCallBackend(CeedVectorGetLength(vec, &length)); // Set value for synced device/host array if (!impl->d_array && !impl->h_array) { @@ -260,16 +382,55 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { } } if (impl->d_array) { - CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val)); + if (val == 0 && !impl->h_array_borrowed) { + CeedCallHip(CeedVectorReturnCeed(vec), hipMemset(impl->d_array, 0, length * sizeof(CeedScalar))); + } else { + CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val)); + } impl->h_array = NULL; - } - if (impl->h_array) { + } else if (impl->h_array) { CeedCallBackend(CeedHostSetValue_Hip(impl->h_array, length, val)); impl->d_array = NULL; } return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Set host array to value strided +//------------------------------------------------------------------------------ +static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + for (CeedSize i = start; i < stop; i += step) h_array[i] = val; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Set device array to value strided (impl in .hip.cpp file) +//------------------------------------------------------------------------------ +int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val); + +//------------------------------------------------------------------------------ +// Set a vector to a value strided +//------------------------------------------------------------------------------ +static int CeedVectorSetValueStrided_Hip(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + CeedSize length; + CeedVector_Hip 
*impl; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + // Set value for synced device/host array + if (stop == -1) stop = length; + if (impl->d_array) { + CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, stop, step, val)); + impl->h_array = NULL; + } else if (impl->h_array) { + CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, stop, step, val)); + impl->d_array = NULL; + } else { + return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set"); + } + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Vector Take Array //------------------------------------------------------------------------------ @@ -298,14 +459,17 @@ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedSca } //------------------------------------------------------------------------------ -// Core logic for array syncronization for GetArray. +// Core logic for array synchronization for GetArray. // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ -static int CeedVectorGetArrayCore_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { +static int CeedVectorGetArrayCore_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedVector_Hip *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); + // Use device memory for unified memory + mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? 
CEED_MEM_DEVICE : mem_type; + // Sync array to requested mem_type CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); @@ -331,15 +495,21 @@ static int CeedVectorGetArrayRead_Hip(const CeedVector vec, const CeedMemType me //------------------------------------------------------------------------------ // Get read/write access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { +static int CeedVectorGetArray_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedVector_Hip *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); + + // Use device memory for unified memory + mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type; + + // 'Get' array and set only 'get'ed array as valid CeedCallBackend(CeedVectorGetArrayCore_Hip(vec, mem_type, array)); CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec)); switch (mem_type) { case CEED_MEM_HOST: impl->h_array = *array; + if (impl->has_unified_addressing) impl->d_array = *array; break; case CEED_MEM_DEVICE: impl->d_array = *array; @@ -351,11 +521,17 @@ static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_ty //------------------------------------------------------------------------------ // Get write access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { +static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) { bool has_array_of_type = true; CeedVector_Hip *impl; + Ceed_Hip *hip_data; CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedGetData(CeedVectorReturnCeed(vec), &hip_data)); + + // Use device memory for unified memory + 
mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type; + CeedCallBackend(CeedVectorHasArrayOfType_Hip(vec, mem_type, &has_array_of_type)); if (!has_array_of_type) { // Allocate if array is not yet allocated @@ -379,119 +555,191 @@ static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType m // Get the norm of a CeedVector //------------------------------------------------------------------------------ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *norm) { - Ceed ceed; - CeedSize length, num_calls; + Ceed ceed; + CeedSize length; +#if (HIP_VERSION < 60000000) + CeedSize num_calls; +#endif /* HIP_VERSION */ const CeedScalar *d_array; CeedVector_Hip *impl; hipblasHandle_t handle; + hipStream_t stream; + Ceed_Hip *hip_data; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedGetData(ceed, &hip_data)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle)); - - // Is the vector too long to handle with int32? If so, we will divide - // it up into "int32-sized" subsections and make repeated BLAS calls. + CeedCallHipblas(ceed, hipblasGetStream(handle, &stream)); +#if (HIP_VERSION < 60000000) + // With ROCm 6, we can use the 64-bit integer interface. Prior to that, + // we need to check if the vector is too long to handle with int32, + // and if so, divide it into subsections for repeated hipBLAS calls. 
num_calls = length / INT_MAX; if (length % INT_MAX > 0) num_calls += 1; +#endif /* HIP_VERSION */ // Compute norm CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); switch (type) { case CEED_NORM_1: { *norm = 0.0; - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - float sub_norm = 0.0; - float *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); - *norm += sub_norm; - } - } else { - double sub_norm = 0.0; - double *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); - *norm += sub_norm; - } +#if defined(CEED_SCALAR_IS_FP32) +#if (HIP_VERSION >= 60000000) // We have ROCm 6, and can use 64-bit integers + CeedCallHipblas(ceed, hipblasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ + float sub_norm = 0.0; + float *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); + *norm += sub_norm; + } +#endif /* HIP_VERSION */ +#else /* CEED_SCALAR */ +#if (HIP_VERSION >= 60000000) + CeedCallHipblas(ceed, hipblasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ + double sub_norm = 0.0; + double *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + + CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); + *norm += sub_norm; } +#endif /* HIP_VERSION */ +#endif /* CEED_SCALAR */ break; } case CEED_NORM_2: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - float sub_norm = 0.0, norm_sum = 0.0; - float *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); - norm_sum += sub_norm * sub_norm; - } - *norm = sqrt(norm_sum); - } else { - double sub_norm = 0.0, norm_sum = 0.0; - double *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - - CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); - norm_sum += sub_norm * sub_norm; - } - *norm = sqrt(norm_sum); +#if defined(CEED_SCALAR_IS_FP32) +#if (HIP_VERSION >= 60000000) + CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ + float sub_norm = 0.0, norm_sum = 0.0; + float *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + + CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); + norm_sum += sub_norm * sub_norm; + } + *norm = sqrt(norm_sum); +#endif /* HIP_VERSION */ +#else /* CEED_SCALAR */ +#if (HIP_VERSION >= 60000000) + CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ + double sub_norm = 0.0, norm_sum = 0.0; + double *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); + norm_sum += sub_norm * sub_norm; } + *norm = sqrt(norm_sum); +#endif /* HIP_VERSION */ +#endif /* CEED_SCALAR */ break; } case CEED_NORM_MAX: { +#if defined(CEED_SCALAR_IS_FP32) +#if (HIP_VERSION >= 60000000) + int64_t index; + CeedScalar norm_no_abs; + + CeedCallHipblas(ceed, hipblasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index)); + CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); + *norm = fabs(norm_no_abs); +#else /* HIP_VERSION */ CeedInt index; - - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - float sub_max = 0.0, current_max = 0.0; - float *d_array_start; - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - - CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); - CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); - if (fabs(sub_max) > current_max) current_max = fabs(sub_max); + float sub_max = 0.0, current_max = 0.0; + float *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + + CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); + if (hip_data->has_unified_addressing) { + CeedCallHip(ceed, hipStreamSynchronize(stream)); + sub_max = fabs(d_array[index - 1]); + } else { + CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); } - *norm = current_max; + if (fabs(sub_max) > current_max) current_max = fabs(sub_max); + } + *norm = current_max; +#endif /* HIP_VERSION */ +#else /* CEED_SCALAR */ +#if (HIP_VERSION >= 60000000) + int64_t index; + CeedScalar norm_no_abs; + + CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index)); + if (hip_data->has_unified_addressing) { + CeedCallHip(ceed, hipStreamSynchronize(stream)); + norm_no_abs = fabs(d_array[index - 1]); } else { - double sub_max = 0.0, current_max = 0.0; - double *d_array_start; - - for (CeedInt i = 0; i < num_calls; i++) { - d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; - CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; - CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - - CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); - CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); - if (fabs(sub_max) > current_max) current_max = fabs(sub_max); + CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); + } + *norm = fabs(norm_no_abs); +#else /* HIP_VERSION */ + CeedInt index; + double sub_max = 0.0, current_max = 0.0; + double *d_array_start; + + for (CeedInt i = 0; i < num_calls; i++) { + d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; + CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; + CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + + CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); + if (hip_data->has_unified_addressing) { + CeedCallHip(ceed, hipStreamSynchronize(stream)); + sub_max = fabs(d_array[index - 1]); + } else { + CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); } - *norm = current_max; + if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } + *norm = current_max; +#endif /* HIP_VERSION */ +#endif /* CEED_SCALAR */ break; } } CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -506,7 +754,7 @@ static int CeedHostReciprocal_Hip(CeedScalar *h_array, CeedSize length) { } //------------------------------------------------------------------------------ -// Take reciprocal of a vector on device (impl in .cu file) +// Take reciprocal of a vector on device (impl in .hip.cpp file) 
//------------------------------------------------------------------------------ int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length); @@ -534,7 +782,7 @@ static int CeedHostScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize len } //------------------------------------------------------------------------------ -// Compute x = alpha x on device (impl in .cu file) +// Compute x = alpha x on device (impl in .hip.cpp file) //------------------------------------------------------------------------------ int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length); @@ -543,13 +791,33 @@ int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length); //------------------------------------------------------------------------------ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) { CeedSize length; - CeedVector_Hip *x_impl; + CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(x, &x_impl)); + CeedCallBackend(CeedVectorGetData(x, &impl)); CeedCallBackend(CeedVectorGetLength(x, &length)); // Set value for synced device/host array - if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Hip(x_impl->d_array, alpha, length)); - if (x_impl->h_array) CeedCallBackend(CeedHostScale_Hip(x_impl->h_array, alpha, length)); + if (impl->d_array) { +#if (HIP_VERSION >= 60000000) + hipblasHandle_t handle; + hipStream_t stream; + + CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle)); + CeedCallHipblas(CeedVectorReturnCeed(x), hipblasGetStream(handle, &stream)); +#if defined(CEED_SCALAR_IS_FP32) + CeedCallHipblas(CeedVectorReturnCeed(x), hipblasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1)); +#else /* CEED_SCALAR */ + CeedCallHipblas(CeedVectorReturnCeed(x), hipblasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1)); +#endif /* CEED_SCALAR */ + CeedCallHip(CeedVectorReturnCeed(x), hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ + 
CeedCallBackend(CeedDeviceScale_Hip(impl->d_array, alpha, length)); +#endif /* HIP_VERSION */ + impl->h_array = NULL; + } + if (impl->h_array) { + CeedCallBackend(CeedHostScale_Hip(impl->h_array, alpha, length)); + impl->d_array = NULL; + } return CEED_ERROR_SUCCESS; } @@ -562,7 +830,7 @@ static int CeedHostAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x } //------------------------------------------------------------------------------ -// Compute y = alpha x + y on device (impl in .cu file) +// Compute y = alpha x + y on device (impl in .hip.cpp file) //------------------------------------------------------------------------------ int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length); @@ -579,11 +847,26 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) { // Set value for synced device/host array if (y_impl->d_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); +#if (HIP_VERSION >= 60000000) + hipblasHandle_t handle; + hipStream_t stream; + + CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle)); + CeedCallHipblas(CeedVectorReturnCeed(y), hipblasGetStream(handle, &stream)); +#if defined(CEED_SCALAR_IS_FP32) + CeedCallHipblas(CeedVectorReturnCeed(y), hipblasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1)); +#else /* CEED_SCALAR */ + CeedCallHipblas(CeedVectorReturnCeed(y), hipblasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1)); +#endif /* CEED_SCALAR */ + CeedCallHip(CeedVectorReturnCeed(y), hipStreamSynchronize(stream)); +#else /* HIP_VERSION */ CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length)); - } - if (y_impl->h_array) { +#endif /* HIP_VERSION */ + y_impl->h_array = NULL; + } else if (y_impl->h_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); CeedCallBackend(CeedHostAXPY_Hip(y_impl->h_array, alpha, x_impl->h_array, 
length)); + y_impl->d_array = NULL; } return CEED_ERROR_SUCCESS; } @@ -597,7 +880,7 @@ static int CeedHostAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar b } //------------------------------------------------------------------------------ -// Compute y = alpha x + beta y on device (impl in .cu file) +// Compute y = alpha x + beta y on device (impl in .hip.cpp file) //------------------------------------------------------------------------------ int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length); @@ -632,7 +915,7 @@ static int CeedHostPointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, C } //------------------------------------------------------------------------------ -// Compute the pointwise multiplication w = x .* y on device (impl in .cu file) +// Compute the pointwise multiplication w = x .* y on device (impl in .hip.cpp file) //------------------------------------------------------------------------------ int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length); @@ -683,6 +966,7 @@ static int CeedVectorDestroy_Hip(const CeedVector vec) { //------------------------------------------------------------------------------ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) { CeedVector_Hip *impl; + Ceed_Hip *hip_impl; Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); @@ -690,19 +974,24 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, 
"Vector", vec, "CopyStrided", CeedVectorCopyStrided_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Hip)); CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedGetData(ceed, &hip_impl)); + CeedCallBackend(CeedDestroy(&ceed)); + impl->has_unified_addressing = hip_impl->has_unified_addressing; CeedCallBackend(CeedVectorSetData(vec, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref.c 
b/backends/hip-ref/ceed-hip-ref.c index 8494d127e3..f22f3a16e7 100644 --- a/backends/hip-ref/ceed-hip-ref.c +++ b/backends/hip-ref/ceed-hip-ref.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -29,7 +29,10 @@ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) { Ceed_Hip *data; CeedCallBackend(CeedGetData(ceed, &data)); - if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle)); + if (!data->hipblas_handle) { + CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle)); + CeedCallHipblas(ceed, hipblasSetPointerMode(data->hipblas_handle, HIPBLAS_POINTER_MODE_HOST)); + } *handle = data->hipblas_handle; return CEED_ERROR_SUCCESS; } @@ -57,9 +60,11 @@ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateAtPoints", CeedElemRestrictionCreate_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreateAtPoints_Hip)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 815790c53c..2e7ee88313 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -17,6 +17,7 @@ #endif typedef struct { + int has_unified_addressing; CeedScalar *h_array; CeedScalar *h_array_borrowed; CeedScalar *h_array_owned; @@ -52,6 +53,18 @@ typedef struct { const CeedInt8 *d_curl_orients; const CeedInt8 *d_curl_orients_borrowed; const CeedInt8 *d_curl_orients_owned; + const CeedInt *h_offsets_at_points; + const CeedInt *h_offsets_at_points_borrowed; + const CeedInt *h_offsets_at_points_owned; + const CeedInt *d_offsets_at_points; + const CeedInt *d_offsets_at_points_borrowed; + const CeedInt *d_offsets_at_points_owned; + const CeedInt *h_points_per_elem; + const CeedInt *h_points_per_elem_borrowed; + const CeedInt *h_points_per_elem_owned; + const CeedInt *d_points_per_elem; + const CeedInt *d_points_per_elem_borrowed; + const CeedInt *d_points_per_elem_owned; } CeedElemRestriction_Hip; typedef struct { @@ -59,9 +72,19 @@ typedef struct { hipFunction_t Interp; hipFunction_t Grad; hipFunction_t Weight; + hipModule_t moduleAtPoints; + CeedInt num_points; + hipFunction_t InterpAtPoints; + hipFunction_t InterpTransposeAtPoints; + hipFunction_t GradAtPoints; + hipFunction_t GradTransposeAtPoints; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; + CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } 
CeedBasis_Hip; typedef struct { @@ -81,7 +104,6 @@ typedef struct { typedef struct { hipModule_t module; const char *qfunction_name; - const char *qfunction_source; hipFunction_t QFunction; Fields_Hip fields; void *d_c; @@ -115,12 +137,17 @@ typedef struct { } CeedOperatorAssemble_Hip; typedef struct { - CeedVector *e_vecs; // E-vectors, inputs followed by outputs - CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator - CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; + uint64_t *input_states, points_state; // State tracking for passive inputs + CeedVector *e_vecs_in, *e_vecs_out; + CeedVector *q_vecs_in, *q_vecs_out; CeedInt num_inputs, num_outputs; CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in; + CeedInt *input_field_order, *output_field_order; + CeedSize max_active_e_vec_len; + CeedInt max_num_points; + CeedInt *num_points; + CeedVector *qf_active_in, point_coords_elem; CeedOperatorDiag_Hip *diag; CeedOperatorAssemble_Hip *asmb; } CeedOperator_Hip; @@ -146,3 +173,4 @@ CEED_INTERN int CeedQFunctionCreate_Hip(CeedQFunction qf); CEED_INTERN int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx); CEED_INTERN int CeedOperatorCreate_Hip(CeedOperator op); +CEED_INTERN int CeedOperatorCreateAtPoints_Hip(CeedOperator op); diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp index 5f6dd15f2a..b9f81032b5 100644 --- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp +++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,14 +8,37 @@ #include #include +//------------------------------------------------------------------------------ +// Kernel for copy strided on device +//------------------------------------------------------------------------------ +__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < size) { + if ((index - start) % step == 0) vec_copy[index] = vec[index]; + } +} + +//------------------------------------------------------------------------------ +// Copy strided on device memory +//------------------------------------------------------------------------------ +extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) { + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; + + if (block_size * grid_size < vec_size) grid_size += 1; + hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array); + return 0; +} + //------------------------------------------------------------------------------ // Kernel for set value on device //------------------------------------------------------------------------------ __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) return; - vec[index] = val; + if (index < size) vec[index] = val; } //------------------------------------------------------------------------------ @@ -31,14 +54,39 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed return 0; } 
+//------------------------------------------------------------------------------ +// Kernel for set value strided on device +//------------------------------------------------------------------------------ +__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index < stop - start) { + if (index % step == 0) vec[start + index] = val; + } +} + +//------------------------------------------------------------------------------ +// Set value strided on device memory +//------------------------------------------------------------------------------ +extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedInt stop, CeedSize step, CeedSize length, CeedScalar val) { + const int block_size = 512; + const CeedSize set_size = stop - start; + int grid_size = set_size / block_size; + + if (block_size * grid_size < set_size) grid_size += 1; + hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, stop, step, val); + return 0; +} + //------------------------------------------------------------------------------ // Kernel for taking reciprocal //------------------------------------------------------------------------------ __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) return; - if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index]; + if (index < size) { + if (fabs(vec[index]) > 1E-16) vec[index] = 1. 
/ vec[index]; + } } //------------------------------------------------------------------------------ @@ -58,10 +106,9 @@ extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) { // Kernel for scale //------------------------------------------------------------------------------ __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) return; - x[index] *= alpha; + if (index < size) x[index] *= alpha; } //------------------------------------------------------------------------------ @@ -81,10 +128,9 @@ extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSi // Kernel for axpy //------------------------------------------------------------------------------ __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) return; - y[index] += alpha * x[index]; + if (index < size) y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ @@ -104,11 +150,12 @@ extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSca // Kernel for axpby //------------------------------------------------------------------------------ __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) return; - y[index] = beta * y[index]; - y[index] += alpha * x[index]; + if (index < size) { + y[index] = beta * 
y[index]; + y[index] += alpha * x[index]; + } } //------------------------------------------------------------------------------ @@ -128,10 +175,9 @@ extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSc // Kernel for pointwise mult //------------------------------------------------------------------------------ __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { - CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (index >= size) return; - w[index] = x[index] * y[index]; + if (index < size) w[index] = x[index] * y[index]; } //------------------------------------------------------------------------------ diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 7642043d4c..3fb4c93630 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "../hip/ceed-hip-common.h" @@ -87,8 +88,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Hip *ceed_Hip; CeedInt dim, num_comp; @@ -105,7 +106,11 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } // Apply basis operation switch (eval_mode) { @@ -113,6 +118,7 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt P_1d, Q_1d; CeedInt block_size = data->block_sizes[0]; + CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); @@ -121,36 +127,37 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const 
CeedInt num_elem, Cee if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } } else if (dim == 2) { // Check if required threads is small enough to do multiple elems const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } } else if (dim == 3) { const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } } } break; @@ -158,6 +165,7 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt P_1d, Q_1d; CeedInt block_size = data->block_sizes[1]; + CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); @@ -167,37 +175,41 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee d_grad_1d = data->d_collo_grad_1d; } void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; + if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } } else if (dim == 2) { // Check if required threads is small enough to do multiple elems const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } } else if (dim == 3) { const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } } } break; @@ -205,25 +217,26 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt Q_1d; CeedInt block_size = data->block_sizes[2]; + CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; if (dim == 1) { const CeedInt opt_elems = block_size / Q_1d; const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); } else if (dim == 2) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } else if (dim == 3) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } @@ -241,6 +254,375 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis apply - tensor AtPoints +//------------------------------------------------------------------------------ +static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool 
apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + Ceed ceed; + CeedInt Q_1d, dim, max_num_points = num_points[0]; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; + const CeedScalar *d_x, *d_u; + CeedScalar *d_v; + CeedBasis_Hip_shared *data; + + CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + + // Weight handled separately + if (eval_mode == CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorSetValue(v, 1.0)); + return CEED_ERROR_SUCCESS; + } + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 
+ " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); + } + } + + // Build kernels if needed + if (data->num_points != max_num_points) { + CeedInt P_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + data->num_points = max_num_points; + + // -- Create interp matrix to Chebyshev coefficients + if (!data->d_chebyshev_interp_1d) { + CeedSize interp_bytes; + CeedScalar *chebyshev_interp_1d; + + interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); + CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&chebyshev_interp_1d)); + } + + // -- Compile kernels + const char basis_kernel_source[] = "// AtPoints basis source\n#include \n"; + CeedInt num_comp; + + if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, 
"BASIS_P_1D", P_1d, "BASIS_T_1D", + CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), + "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE", + data->block_sizes[0])); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints)); + } + + // Get read/write access to u, v + CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); + if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } + + // Basis action + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 
256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, + thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + interp_args)); + } + } else if (dim == 3) { + const CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + interp_args)); + } + } + } break; + case CEED_EVAL_GRAD: { + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, + thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + grad_args)); + } + } else if (dim == 3) { + const CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + if (is_transpose) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, + thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, + grad_args)); + } + } + } break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_NONE: /* handled separately below */ + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + + // Restore vectors, cover CEED_EVAL_NONE + CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); + if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + 
return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Apply basis +//------------------------------------------------------------------------------ +static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + Ceed ceed; + Ceed_Hip *ceed_Hip; + CeedInt dim, num_comp; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Hip_shared *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); + CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + + // Get read/write access to u, v + if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + if (apply_add) { + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + } else { + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + } + + // Apply basis operation + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedInt P, Q; + + CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + void 
*interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; + + { + CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); + + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, + elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args)); + } + } + } break; + case CEED_EVAL_GRAD: { + CeedInt P, Q; + + CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; + + { + CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); + + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, + elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args)); + } + } + } break; + case CEED_EVAL_WEIGHT: { + CeedInt P, Q; + + CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + + { + CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + + CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args)); + } + } break; + case CEED_EVAL_NONE: /* handled separately below */ + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + + // Restore vectors, cover CEED_EVAL_NONE + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); + if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +int CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +int 
CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ -254,11 +636,16 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallHip(ceed, hipModuleUnload(data->module)); - CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); + if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); CeedCallHip(ceed, hipFree(data->d_interp_1d)); CeedCallHip(ceed, hipFree(data->d_grad_1d)); CeedCallHip(ceed, hipFree(data->d_collo_grad_1d)); + CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -268,8 +655,6 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - char *basis_kernel_source; - const char *basis_kernel_path; CeedInt num_comp; const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); const CeedInt interp_bytes = q_bytes * P_1d; @@ -279,8 +664,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); + 
if (q_weight_1d) { + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); + } CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); @@ -305,28 +692,106 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes)); // Compile basis kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); - CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", + bool is_collocated = false; + const char basis_kernel_source[] = "// Tensor basis source\n#include \n"; + + CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE", data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); + CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? 
"InterpCollocated" : "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", + &data->InterpTransposeAdd)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); + + CeedCallBackend(CeedBasisSetData(basis, data)); + + // Register backend functions + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create non-tensor basis +//------------------------------------------------------------------------------ +int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { + Ceed 
ceed; + CeedInt num_comp, q_comp_interp, q_comp_grad; + const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + CeedBasis_Hip_shared *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + // Check shared memory size + { + Ceed_Hip *hip_data; + + CeedCallBackend(CeedGetData(ceed, &hip_data)); + if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > + hip_data->device_prop.sharedMemPerBlock) { + CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; + } + } + + CeedCallBackend(CeedCalloc(1, &data)); + + // Copy basis data to GPU + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); + if (q_weight) { + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight, q_bytes, hipMemcpyHostToDevice)); + } + if (interp) { + const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; + + CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp, interp_bytes, hipMemcpyHostToDevice)); + } + if (grad) { + const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; + + CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, grad_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad, grad_bytes, hipMemcpyHostToDevice)); + } + + // Compile basis kernels + const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n"; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes)); + CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", 
num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D", + CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE", + data->block_sizes[0])); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions - CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c index f69c78d4ee..afb39e8bde 100644 --- a/backends/hip-shared/ceed-hip-shared.c +++ b/backends/hip-shared/ceed-hip-shared.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -33,8 +33,10 @@ static int CeedInit_Hip_shared(const char *resource, Ceed ceed) { CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index 6a7c99d048..c534b85e33 100644 --- a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -14,15 +14,32 @@ typedef struct { hipModule_t module; hipFunction_t Interp; hipFunction_t InterpTranspose; + hipFunction_t InterpTransposeAdd; hipFunction_t Grad; hipFunction_t GradTranspose; + hipFunction_t GradTransposeAdd; hipFunction_t Weight; + hipModule_t moduleAtPoints; + CeedInt num_points; + hipFunction_t InterpAtPoints; + hipFunction_t InterpTransposeAtPoints; + hipFunction_t InterpTransposeAddAtPoints; + hipFunction_t GradAtPoints; + hipFunction_t GradTransposeAtPoints; + hipFunction_t GradTransposeAddAtPoints; CeedInt block_sizes[3]; // interp, grad, weight thread block sizes CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_collo_grad_1d; CeedScalar *d_q_weight_1d; + CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Hip_shared; CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); + +CEED_INTERN int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c index 597aee9037..c33f13b766 100644 --- a/backends/hip/ceed-hip-common.c +++ b/backends/hip/ceed-hip-common.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -19,7 +19,8 @@ int CeedInit_Hip(Ceed ceed, const char *resource) { Ceed_Hip *data; const char *device_spec = strstr(resource, ":device_id="); const int device_id = (device_spec) ? atoi(device_spec + 11) : -1; - int current_device_id; + int current_device_id, xnack_value; + const char *xnack; CeedCallHip(ceed, hipGetDevice(&current_device_id)); if (device_id >= 0 && current_device_id != device_id) { @@ -30,6 +31,12 @@ int CeedInit_Hip(Ceed ceed, const char *resource) { CeedCallBackend(CeedGetData(ceed, &data)); data->device_id = current_device_id; CeedCallHip(ceed, hipGetDeviceProperties(&data->device_prop, current_device_id)); + xnack = getenv("HSA_XNACK"); + xnack_value = !!xnack ? atol(xnack) : 0; + data->has_unified_addressing = xnack_value > 0 ? data->device_prop.unifiedAddressing : 0; + if (data->has_unified_addressing) { + CeedDebug(ceed, "Using unified memory addressing"); + } data->opt_block_size = 256; return CEED_ERROR_SUCCESS; } @@ -53,10 +60,15 @@ static inline int CeedSetDeviceGenericArray_Hip(Ceed ceed, const void *source_ar void *target_array_owned, void *target_array_borrowed, void *target_array) { switch (copy_mode) { case CEED_COPY_VALUES: - if (!*(void **)target_array_owned) CeedCallHip(ceed, hipMalloc(target_array_owned, size_unit * num_values)); - if (source_array) CeedCallHip(ceed, hipMemcpy(*(void **)target_array_owned, source_array, size_unit * num_values, hipMemcpyDeviceToDevice)); - *(void **)target_array_borrowed = NULL; - *(void **)target_array = *(void **)target_array_owned; + if (!*(void **)target_array) { + if (*(void **)target_array_borrowed) { + *(void **)target_array = *(void **)target_array_borrowed; + } else { + if (!*(void **)target_array_owned) CeedCallHip(ceed, hipMalloc(target_array_owned, size_unit * num_values)); + *(void **)target_array = *(void **)target_array_owned; + } + } + if (source_array) CeedCallHip(ceed, hipMemcpy(*(void **)target_array, source_array, size_unit * 
num_values, hipMemcpyDeviceToDevice)); break; case CEED_OWN_POINTER: CeedCallHip(ceed, hipFree(*(void **)target_array_owned)); diff --git a/backends/hip/ceed-hip-common.h b/backends/hip/ceed-hip-common.h index c62c392abe..fb89216be5 100644 --- a/backends/hip/ceed-hip-common.h +++ b/backends/hip/ceed-hip-common.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -72,6 +72,7 @@ typedef struct { hipblasHandle_t hipblas_handle; struct hipDeviceProp_t device_prop; int opt_block_size; + int has_unified_addressing; } Ceed_Hip; CEED_INTERN int CeedInit_Hip(Ceed ceed, const char *resource); diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index cafb79ed7f..e30bc07a02 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -33,12 +33,13 @@ //------------------------------------------------------------------------------ // Compile HIP kernel //------------------------------------------------------------------------------ -int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) 
{ +static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module, + const CeedInt num_defines, va_list args) { size_t ptx_size; - char *jit_defs_source, *ptx; - const char *jit_defs_path; - const int num_opts = 3; - const char *opts[num_opts]; + char *ptx; + const int num_opts = 4; + CeedInt num_jit_source_dirs = 0, num_jit_defines = 0; + const char **opts; int runtime_version; hiprtcProgram prog; struct hipDeviceProp_t prop; @@ -62,8 +63,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce // Kernel specific options, such as kernel constants if (num_defines > 0) { - va_list args; - va_start(args, num_defines); char *name; int val; @@ -72,24 +71,48 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } - va_end(args); } // Standard libCEED definitions for HIP backends - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-jit.h", &jit_defs_path)); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source)); - code << jit_defs_source; - code << "\n\n"; - CeedCallBackend(CeedFree(&jit_defs_path)); - CeedCallBackend(CeedFree(&jit_defs_source)); + code << "#include <ceed/jit-source/hip/hip-jit.h>\n\n"; // Non-macro options + CeedCallBackend(CeedCalloc(num_opts, &opts)); opts[0] = "-default-device"; CeedCallBackend(CeedGetData(ceed, (void **)&ceed_data)); CeedCallHip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id)); std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName); opts[1] = arch_arg.c_str(); opts[2] = "-munsafe-fp-atomics"; + opts[3] = "-DCEED_RUNNING_JIT_PASS=1"; + // Additional include dirs + { + const char **jit_source_dirs; + + CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs)); + CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts)); + for (CeedInt i = 0; i < 
num_jit_source_dirs; i++) { + std::ostringstream include_dir_arg; + + include_dir_arg << "-I" << jit_source_dirs[i]; + CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i])); + } + CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs)); + } + // User defines + { + const char **jit_defines; + + CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines)); + CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts)); + for (CeedInt i = 0; i < num_jit_defines; i++) { + std::ostringstream define_arg; + + define_arg << "-D" << jit_defines[i]; + CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i])); + } + CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines)); + } // Add string source argument provided in call code << source; @@ -98,19 +121,48 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); // Compile kernel - hiprtcResult result = hiprtcCompileProgram(prog, num_opts, opts); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n"); + CeedDebug(ceed, "Source:\n%s\n", code.str().c_str()); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n"); + if (CeedDebugFlag(ceed)) { + // LCOV_EXCL_START + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n"); + for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) { + CeedDebug(ceed, "Option %d: %s", i, opts[i]); + } + CeedDebug(ceed, ""); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n"); + // LCOV_EXCL_STOP + } + hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts); - if (result != HIPRTC_SUCCESS) { + 
for (CeedInt i = 0; i < num_jit_source_dirs; i++) { + CeedCallBackend(CeedFree(&opts[num_opts + i])); + } + for (CeedInt i = 0; i < num_jit_defines; i++) { + CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); + } + CeedCallBackend(CeedFree(&opts)); + *is_compile_good = result == HIPRTC_SUCCESS; + if (!*is_compile_good) { size_t log_size; char *log; - CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n"); - CeedDebug(ceed, "Source:\n%s\n", code.str().c_str()); - CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n"); CeedChk_hiprtc(ceed, hiprtcGetProgramLogSize(prog, &log_size)); CeedCallBackend(CeedMalloc(log_size, &log)); CeedCallHiprtc(ceed, hiprtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log); + if (throw_error) { + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log); + } else { + // LCOV_EXCL_START + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n"); + CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log); + CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n"); + CeedCallBackend(CeedFree(&log)); + CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog)); + return CEED_ERROR_SUCCESS; + // LCOV_EXCL_STOP + } } CeedCallHiprtc(ceed, hiprtcGetCodeSize(prog, &ptx_size)); @@ -123,6 +175,29 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce return CEED_ERROR_SUCCESS; } +int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) 
{ + bool is_compile_good = true; + va_list args; + + va_start(args, num_defines); + const CeedInt ierr = CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args); + + va_end(args); + CeedCallBackend(ierr); + return CEED_ERROR_SUCCESS; +} + +int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...) { + va_list args; + + va_start(args, num_defines); + const CeedInt ierr = CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args); + + va_end(args); + CeedCallBackend(ierr); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Get HIP kernel //------------------------------------------------------------------------------ @@ -151,9 +226,43 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c //------------------------------------------------------------------------------ // Run HIP kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ -int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, - const int block_size_z, const int shared_mem_size, void **args) { - CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL)); +static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x, + const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error, + bool *is_good_run, void **args) { + hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL); + + if (result == hipSuccess) { + *is_good_run = true; + } else { + if (throw_error) 
{ + CeedCallHip(ceed, result); + } else { + // LCOV_EXCL_START + const char *message = hipGetErrorName(result); + + CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n"); + CeedDebug(ceed, "%s\n", message); + CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n"); + // LCOV_EXCL_STOP + } + *is_good_run = false; + } + return CEED_ERROR_SUCCESS; +} + +int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x, + const int block_size_y, const int block_size_z, const int shared_mem_size, void **args) { + bool is_good_run = true; + + CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, + &is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + +int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x, + const int block_size_y, const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) { + CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, + is_good_run, args)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h index d990924ec2..dd48fe4cd0 100644 --- a/backends/hip/ceed-hip-compile.h +++ b/backends/hip/ceed-hip-compile.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -13,6 +13,7 @@ static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } CEED_INTERN int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...); CEED_INTERN int CeedGetKernel_Hip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel); @@ -21,5 +22,7 @@ CEED_INTERN int CeedRunKernel_Hip(Ceed ceed, hipFunction_t kernel, int grid_size CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, void **args); -CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, - int shared_mem_size, void **args); +CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y, + int block_size_z, int shared_mem_size, void **args); +CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y, + int block_size_z, int shared_mem_size, bool *is_good_run, void **args); diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index 3d00a64100..6ce6ce33b9 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -26,7 +26,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { Ceed ceed; Ceed_Magma *data; CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q; @@ -52,7 +53,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose // Read vectors if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (e_mode) { @@ -115,9 +117,10 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem}; if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? 
impl->InterpTransposeAdd : impl->InterpTranspose, NULL, grid, num_threads, + num_t_col, 1, shared_mem, args)); } else { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, NULL, grid, num_threads, num_t_col, 1, shared_mem, args)); } } break; case CEED_EVAL_GRAD: { @@ -192,13 +195,15 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, NULL, grid, num_threads, + num_t_col, 1, shared_mem, args)); } else { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, NULL, grid, num_threads, num_t_col, 1, shared_mem, args)); } } break; case CEED_EVAL_WEIGHT: { CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[e_mode]); CeedInt elem_dofs_size = CeedIntPow(Q, dim); CeedInt num_threads = 1; CeedInt num_t_col = 1; @@ -225,7 +230,7 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose CeedInt grid = CeedDivUpInt(num_elem, num_t_col); void *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem}; - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, num_threads, num_t_col, 1, shared_mem, args)); } break; // 
LCOV_EXCL_START case CEED_EVAL_DIV: @@ -244,14 +249,33 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Basis apply - tensor AtPoints +//------------------------------------------------------------------------------ +int CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "Backend does not implement CeedBasisApplyAtPoints"); +} + //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, + CeedVector u, CeedVector v) { Ceed ceed; Ceed_Magma *data; CeedInt num_comp, num_nodes, num_qpts, P, Q, N; @@ -272,7 +296,8 @@ 
static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed // Read vectors if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Compile kernels for N as needed CeedInt iN = 0; @@ -335,16 +360,19 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed impl->NB_deriv_t[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_ta", &impl->InterpTransposeAdd[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_ta", &impl->DerivTransposeAdd[iN])); if (!impl->Weight) { CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight)); CeedCallBackend(CeedFree(&weight_kernel_path)); } CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); - for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); - CeedCall(CeedFree(&file_paths)); + for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i])); + CeedCallBackend(CeedFree(&file_paths)); + CeedCallBackend(CeedDestroy(&ceed_delegate)); } } @@ -379,7 +407,7 @@ 
static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { if (e_mode == CEED_EVAL_INTERP) { if (t_mode == CEED_TRANSPOSE) { - Kernel = impl->InterpTranspose[iN]; + Kernel = apply_add ? impl->InterpTransposeAdd[iN] : impl->InterpTranspose[iN]; NB = impl->NB_interp_t[iN]; } else { Kernel = impl->Interp[iN]; @@ -387,7 +415,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed } } else { if (t_mode == CEED_TRANSPOSE) { - Kernel = impl->DerivTranspose[iN]; + Kernel = apply_add ? impl->DerivTransposeAdd[iN] : impl->DerivTranspose[iN]; NB = impl->NB_deriv_t[iN]; } else { Kernel = impl->Deriv[iN]; @@ -401,11 +429,11 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed CeedInt shared_mem = (t_mode != CEED_TRANSPOSE && q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B); void *args[] = {&N, &d_b, &d_u, &d_v}; - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, NULL, grid, M, num_t_col, 1, shared_mem, args)); } else { for (CeedInt d = 0; d < q_comp; d++) { if (t_mode == CEED_TRANSPOSE) { - const CeedScalar beta = (d > 0) ? 1.0 : 0.0; + const CeedScalar beta = (apply_add || (d > 0)) ? 
1.0 : 0.0; magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue); } else { magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue); @@ -414,12 +442,13 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed } } else { CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight not set", CeedEvalModes[e_mode]); CeedInt num_t_col = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D); CeedInt grid = CeedDivUpInt(num_elem, num_t_col); CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar); void *args[] = {&num_elem, &impl->d_q_weight, &d_v}; - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, Q, num_t_col, 1, shared_mem, args)); } // Must sync to ensure completeness @@ -430,6 +459,19 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + CeedCallBackend(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ 
-449,8 +491,9 @@ static int CeedBasisDestroy_Magma(CeedBasis basis) { #endif CeedCallBackend(magma_free(impl->d_interp_1d)); CeedCallBackend(magma_free(impl->d_grad_1d)); - CeedCallBackend(magma_free(impl->d_q_weight_1d)); + if (impl->d_q_weight_1d) CeedCallBackend(magma_free(impl->d_q_weight_1d)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -476,8 +519,9 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { CeedCallBackend(magma_free(impl->d_grad)); CeedCallBackend(magma_free(impl->d_div)); CeedCallBackend(magma_free(impl->d_curl)); - CeedCallBackend(magma_free(impl->d_q_weight)); + if (impl->d_q_weight) CeedCallBackend(magma_free(impl->d_q_weight)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -500,8 +544,10 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedCallBackend(CeedCalloc(1, &impl)); // Copy basis data to GPU - CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0]))); - magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue); + if (q_weight_1d) { + CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0]))); + magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue); + } CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0]))); magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue); CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0]))); @@ -547,22 +593,28 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const case 1: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, 
"magma_interpt_1d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_1d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_1d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight)); break; case 2: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_2d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_2d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight)); break; case 3: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_3d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_3d_kernel", &impl->GradTransposeAdd)); 
CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight)); break; } @@ -570,13 +622,17 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedCallBackend(CeedFree(&grad_kernel_path)); CeedCallBackend(CeedFree(&weight_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); - for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); - CeedCall(CeedFree(&file_paths)); + for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i])); + CeedCallBackend(CeedFree(&file_paths)); CeedCallBackend(CeedBasisSetData(basis, impl)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_delegate)); return CEED_ERROR_SUCCESS; } @@ -594,8 +650,10 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node CeedCallBackend(CeedCalloc(1, &impl)); // Copy basis data to GPU - CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); - magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); + if (q_weight) { + CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); + magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); + } if (interp) { CeedInt q_comp_interp; @@ -629,13 +687,16 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); 
CeedCallBackend(CeedFree(&weight_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed_delegate)); } CeedCallBackend(CeedBasisSetData(basis, impl)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -653,8 +714,10 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no CeedCallBackend(CeedCalloc(1, &impl)); // Copy basis data to GPU - CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); - magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); + if (q_weight) { + CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); + magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); + } if (interp) { CeedInt q_comp_interp; @@ -688,13 +751,16 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); CeedCallBackend(CeedFree(&weight_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed_delegate)); } CeedCallBackend(CeedBasisSetData(basis, impl)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); + 
CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -712,8 +778,10 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n CeedCallBackend(CeedCalloc(1, &impl)); // Copy basis data to GPU - CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); - magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); + if (q_weight) { + CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); + magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); + } if (interp) { CeedInt q_comp_interp; @@ -747,13 +815,16 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); CeedCallBackend(CeedFree(&weight_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed_delegate)); } CeedCallBackend(CeedBasisSetData(basis, impl)); // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c index 592f216c6f..8e62e36b9c 100644 --- a/backends/magma/ceed-magma-common.c +++ b/backends/magma/ceed-magma-common.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/ceed-magma-common.h b/backends/magma/ceed-magma-common.h index 5ebf9b0d10..83c313390e 100644 --- a/backends/magma/ceed-magma-common.h +++ b/backends/magma/ceed-magma-common.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c index 9b7125ccda..081cb6e7d9 100644 --- a/backends/magma/ceed-magma-det.c +++ b/backends/magma/ceed-magma-det.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -35,6 +35,7 @@ static int CeedInit_Magma_Det(const char *resource, Ceed ceed) { CeedCallBackend(CeedInit("/gpu/cuda/magma", &ceed_ref)); #endif CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma)); return CEED_ERROR_SUCCESS; diff --git a/backends/magma/ceed-magma-gemm-nontensor.cpp b/backends/magma/ceed-magma-gemm-nontensor.cpp index 856b514acb..c43ff9266a 100644 --- a/backends/magma/ceed-magma-gemm-nontensor.cpp +++ b/backends/magma/ceed-magma-gemm-nontensor.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/ceed-magma-gemm-nontensor.h b/backends/magma/ceed-magma-gemm-nontensor.h index 0431620b83..f7108b07c4 100644 --- a/backends/magma/ceed-magma-gemm-nontensor.h +++ b/backends/magma/ceed-magma-gemm-nontensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/ceed-magma-gemm-selector.cpp b/backends/magma/ceed-magma-gemm-selector.cpp index 46f963bca0..193c5ba4f5 100644 --- a/backends/magma/ceed-magma-gemm-selector.cpp +++ b/backends/magma/ceed-magma-gemm-selector.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/ceed-magma-gemm-selector.h b/backends/magma/ceed-magma-gemm-selector.h index c96c95c169..c199ef7dc2 100644 --- a/backends/magma/ceed-magma-gemm-selector.h +++ b/backends/magma/ceed-magma-gemm-selector.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c index 06254365b9..9908dd55da 100644 --- a/backends/magma/ceed-magma.c +++ b/backends/magma/ceed-magma.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -36,6 +36,7 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) { CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref)); #endif CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma)); diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index aa60b37b40..c800f2a6ab 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -47,8 +47,10 @@ typedef struct { CeedMagmaModule module; CeedMagmaFunction Interp; CeedMagmaFunction InterpTranspose; + CeedMagmaFunction InterpTransposeAdd; CeedMagmaFunction Grad; CeedMagmaFunction GradTranspose; + CeedMagmaFunction GradTransposeAdd; CeedMagmaFunction Weight; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; @@ -59,8 +61,10 @@ typedef struct { CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaFunction InterpTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaFunction DerivTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Weight; CeedInt NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedInt NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; diff --git a/backends/magma/tuning/Makefile b/backends/magma/tuning/Makefile index 37cfa194f7..bde10abd6e 100644 --- a/backends/magma/tuning/Makefile +++ b/backends/magma/tuning/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/tuning/generate_tuning.py b/backends/magma/tuning/generate_tuning.py index 10a2062881..2e3180ba2f 100644 --- a/backends/magma/tuning/generate_tuning.py +++ b/backends/magma/tuning/generate_tuning.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/magma/tuning/tuning.cpp b/backends/magma/tuning/tuning.cpp index 7a387c14b6..37f20863ae 100644 --- a/backends/magma/tuning/tuning.cpp +++ b/backends/magma/tuning/tuning.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c index 4d9f557af5..009c9e4601 100644 --- a/backends/memcheck/ceed-memcheck-blocked.c +++ b/backends/memcheck/ceed-memcheck-blocked.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -22,6 +22,7 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/ref/blocked", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Memcheck)); diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c index 7c66e3601a..17d823d4ab 100644 --- a/backends/memcheck/ceed-memcheck-qfunction.c +++ b/backends/memcheck/ceed-memcheck-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -17,24 +17,32 @@ // QFunction Apply //------------------------------------------------------------------------------ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { - Ceed ceed; void *ctx_data = NULL; + int input_block_ids[CEED_FIELD_MAX], output_block_ids[CEED_FIELD_MAX]; CeedInt num_in, num_out; CeedQFunctionUser f = NULL; CeedQFunctionField *output_fields; CeedQFunction_Memcheck *impl; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallBackend(CeedQFunctionGetData(qf, &impl)); CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data)); CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f)); CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out)); - int mem_block_ids[num_out]; - // Get input/output arrays + // Get input arrays for (CeedInt i = 0; i < num_in; i++) { + CeedSize len; + char name[32] = ""; + CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i])); + + CeedCallBackend(CeedVectorGetLength(U[i], &len)); + + snprintf(name, 32, "QFunction input %" CeedInt_FMT, i); + input_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->inputs[i], len, name); } + + // Get output arrays for (CeedInt i = 0; i < num_out; i++) { CeedSize len; char name[32] = ""; @@ -44,8 +52,8 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector * CeedCallBackend(CeedVectorGetLength(V[i], &len)); VALGRIND_MAKE_MEM_UNDEFINED(impl->outputs[i], len); - snprintf(name, 32, "'QFunction output %" CeedInt_FMT "'", i); - mem_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name); + snprintf(name, 32, "QFunction output %" CeedInt_FMT, i); + output_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name); } // Call user function @@ -54,8 +62,10 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector * // Restore input arrays for (CeedInt i = 0; i < num_in; i++) { 
CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->inputs[i])); + VALGRIND_DISCARD(input_block_ids[i]); } - // Check for unset output values + + // Check for unset output values and restore arrays { const char *kernel_name, *kernel_path; @@ -63,17 +73,19 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector * CeedCallBackend(CeedQFunctionGetKernelName(qf, &kernel_name)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &output_fields)); for (CeedInt i = 0; i < num_out; i++) { - CeedInt field_size; + const char *field_name; + CeedInt field_size; // Note: need field size because vector may be longer than needed for output CeedCallBackend(CeedQFunctionFieldGetSize(output_fields[i], &field_size)); + CeedCallBackend(CeedQFunctionFieldGetName(output_fields[i], &field_name)); for (CeedSize j = 0; j < field_size * (CeedSize)Q; j++) { - CeedCheck(!isnan(impl->outputs[i][j]), ceed, CEED_ERROR_BACKEND, - "QFunction output %" CeedInt_FMT " entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, j, kernel_path, - kernel_name); + CeedCheck(!isnan(impl->outputs[i][j]), CeedQFunctionReturnCeed(qf), CEED_ERROR_BACKEND, + "QFunction output %" CeedInt_FMT " '%s' entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, field_name, j, + kernel_path, kernel_name); } CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i])); - VALGRIND_DISCARD(mem_block_ids[i]); + VALGRIND_DISCARD(output_block_ids[i]); } } CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data)); @@ -107,6 +119,7 @@ int CeedQFunctionCreate_Memcheck(CeedQFunction qf) { CeedCallBackend(CeedQFunctionSetData(qf, impl)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Memcheck)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git 
a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c index 4da0d0ee68..01f67802c3 100644 --- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c +++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -20,7 +20,7 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b CeedQFunctionContext_Memcheck *impl; CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - *has_valid_data = impl->data; + *has_valid_data = !!impl->data_allocated; return CEED_ERROR_SUCCESS; } @@ -30,9 +30,10 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { CeedQFunctionContext_Memcheck *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); - *has_borrowed_data_of_type = impl->data_borrowed; + + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + *has_borrowed_data_of_type = !!impl->data_borrowed; return CEED_ERROR_SUCCESS; } @@ -43,35 +44,69 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe size_t ctx_size; CeedQFunctionContext_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); 
CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); - + // Clear previous owned data buffers + if (impl->data_allocated) { + memset(impl->data_allocated, -42, ctx_size); + VALGRIND_DISCARD(impl->allocated_block_id); + } CeedCallBackend(CeedFree(&impl->data_allocated)); + if (impl->data_owned) { + memset(impl->data_owned, -42, ctx_size); + VALGRIND_DISCARD(impl->owned_block_id); + } CeedCallBackend(CeedFree(&impl->data_owned)); + + // Clear borrowed block id, if present + if (impl->data_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id); + + // Set internal pointers to external buffers switch (copy_mode) { case CEED_COPY_VALUES: - CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_owned)); + impl->data_owned = NULL; impl->data_borrowed = NULL; - impl->data = impl->data_owned; - memcpy(impl->data, data, ctx_size); break; case CEED_OWN_POINTER: - impl->data_owned = data; - impl->data_borrowed = NULL; - impl->data = data; + impl->data_owned = data; + impl->data_borrowed = NULL; + impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->data_owned, ctx_size, "Owned external data buffer"); break; case CEED_USE_POINTER: - impl->data_borrowed = data; - impl->data = data; + impl->data_owned = NULL; + impl->data_borrowed = data; + impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->data_borrowed, ctx_size, "Borrowed external data buffer"); } - // Copy data to check ctx_size bounds + + // Create internal data buffer CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_allocated)); - memcpy(impl->data_allocated, impl->data, ctx_size); - impl->data = impl->data_allocated; - VALGRIND_DISCARD(impl->mem_block_id); - impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->data, ctx_size, "'QFunction backend context data copy'"); + impl->allocated_block_id = VALGRIND_CREATE_BLOCK(impl->data_allocated, ctx_size, "'Allocated 
internal context data buffer"); + memcpy(impl->data_allocated, data, ctx_size); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Sync data +//------------------------------------------------------------------------------ +static int CeedQFunctionContextSyncData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type) { + size_t ctx_size; + CeedQFunctionContext_Memcheck *impl; + + CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + + // Copy internal buffer back to owned or borrowed data buffer + if (impl->data_owned) { + memcpy(impl->data_owned, impl->data_allocated, ctx_size); + } + if (impl->data_borrowed) { + memcpy(impl->data_borrowed, impl->data_allocated, ctx_size); + } return CEED_ERROR_SUCCESS; } @@ -79,16 +114,27 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe // QFunctionContext Take Data //------------------------------------------------------------------------------ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + size_t ctx_size; CeedQFunctionContext_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // Synchronize memory + CeedCallBackend(CeedQFunctionContextSyncData_Memcheck(ctx, CEED_MEM_HOST)); + // Return borrowed buffer *(void **)data = impl->data_borrowed; 
impl->data_borrowed = NULL; - impl->data = NULL; - VALGRIND_DISCARD(impl->mem_block_id); + VALGRIND_DISCARD(impl->borrowed_block_id); + + // De-allocate internal memory + if (impl->data_allocated) { + memset(impl->data_allocated, -42, ctx_size); + VALGRIND_DISCARD(impl->allocated_block_id); + } CeedCallBackend(CeedFree(&impl->data_allocated)); return CEED_ERROR_SUCCESS; } @@ -97,13 +143,19 @@ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedM // QFunctionContext Get Data //------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + size_t ctx_size; CeedQFunctionContext_Memcheck *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); - CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - *(void **)data = impl->data; + // Create and return writable buffer + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_writable_copy)); + impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->data_writable_copy, ctx_size, "Allocated writeable data buffer copy"); + memcpy(impl->data_writable_copy, impl->data_allocated, ctx_size); + *(void **)data = impl->data_writable_copy; return CEED_ERROR_SUCCESS; } @@ -114,13 +166,18 @@ static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, Ce size_t ctx_size; CeedQFunctionContext_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + 
CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - CeedCallBackend(CeedQFunctionContextGetData_Memcheck(ctx, mem_type, data)); - // Make copy to verify no write occurred - CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy)); - memcpy(impl->data_read_only_copy, *(void **)data, ctx_size); + // Create and return read-only buffer + if (!impl->data_read_only_copy) { + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy)); + impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->data_read_only_copy, ctx_size, "Allocated read-only data buffer copy"); + memcpy(impl->data_read_only_copy, impl->data_allocated, ctx_size); + } + *(void **)data = impl->data_read_only_copy; return CEED_ERROR_SUCCESS; } @@ -134,8 +191,14 @@ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) { CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - if (impl->data_borrowed) memcpy(impl->data_borrowed, impl->data, ctx_size); - if (impl->data_owned) memcpy(impl->data_owned, impl->data, ctx_size); + // Copy back to internal buffer and sync + memcpy(impl->data_allocated, impl->data_writable_copy, ctx_size); + CeedCallBackend(CeedQFunctionContextSyncData_Memcheck(ctx, CEED_MEM_HOST)); + + // Invalidate writable buffer + memset(impl->data_writable_copy, -42, ctx_size); + CeedCallBackend(CeedFree(&impl->data_writable_copy)); + VALGRIND_DISCARD(impl->writable_block_id); return CEED_ERROR_SUCCESS; } @@ -149,10 +212,15 @@ static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - CeedCheck(!memcmp(impl->data, impl->data_read_only_copy, ctx_size), CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, - "Context data 
changed while accessed in read-only mode"); + // Verify no changes made during read-only access + bool is_changed = memcmp(impl->data_allocated, impl->data_read_only_copy, ctx_size); + + CeedCheck(!is_changed, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode"); + // Invalidate read-only buffer + memset(impl->data_read_only_copy, -42, ctx_size); CeedCallBackend(CeedFree(&impl->data_read_only_copy)); + VALGRIND_DISCARD(impl->read_only_block_id); return CEED_ERROR_SUCCESS; } @@ -165,15 +233,31 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { CeedQFunctionContext_Memcheck *impl; CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function)); + CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function)); CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only destroy HOST memory for this backend"); + // Run user destroy routine if (data_destroy_function) { - CeedCallBackend(data_destroy_function(impl->data_borrowed ? impl->data_borrowed : impl->data_owned)); + bool is_borrowed = !!impl->data_borrowed; + + CeedCallBackend(data_destroy_function(is_borrowed ? 
impl->data_borrowed : impl->data_owned)); + if (is_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id); + else VALGRIND_DISCARD(impl->owned_block_id); + } + // Free allocations and discard block ids + if (impl->data_allocated) { + CeedCallBackend(CeedFree(&impl->data_allocated)); + VALGRIND_DISCARD(impl->allocated_block_id); + } + if (impl->data_owned) { + CeedCallBackend(CeedFree(&impl->data_owned)); + VALGRIND_DISCARD(impl->owned_block_id); + } + if (impl->data_borrowed) { + VALGRIND_DISCARD(impl->borrowed_block_id); } - CeedCallBackend(CeedFree(&impl->data_allocated)); return CEED_ERROR_SUCCESS; } @@ -183,9 +267,19 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) { CeedQFunctionContext_Memcheck *impl; + // Free allocations and discard block ids CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - CeedCallBackend(CeedFree(&impl->data_allocated)); - CeedCallBackend(CeedFree(&impl->data_owned)); + if (impl->data_allocated) { + CeedCallBackend(CeedFree(&impl->data_allocated)); + VALGRIND_DISCARD(impl->allocated_block_id); + } + if (impl->data_owned) { + CeedCallBackend(CeedFree(&impl->data_owned)); + VALGRIND_DISCARD(impl->owned_block_id); + } + if (impl->data_borrowed) { + VALGRIND_DISCARD(impl->borrowed_block_id); + } CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -208,6 +302,7 @@ int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreDataRead_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "DataDestroy", CeedQFunctionContextDataDestroy_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Memcheck)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedCalloc(1, &impl)); 
CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c index f2877c3a69..e728d08d17 100644 --- a/backends/memcheck/ceed-memcheck-restriction.c +++ b/backends/memcheck/ceed-memcheck-restriction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -57,8 +57,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedE // Apply restriction for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { + for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize n = 0; n < elem_size; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]]; @@ -78,7 +78,7 @@ static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedEl CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; } @@ -96,7 +96,7 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(Ceed 
CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0); @@ -115,7 +115,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core( CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedSize n = 0; CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { @@ -125,7 +125,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core( uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; } - CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { + for (n = 1; n < elem_size - 1; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * @@ -156,7 +156,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memche CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedSize n = 0; CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { @@ -166,7 +166,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memche 
uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); } - CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { + for (n = 1; n < elem_size - 1; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * @@ -203,8 +203,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedEle // Apply restriction for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { + for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize n = 0; n < elem_size; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; @@ -381,7 +381,7 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedEl } } else { for (CeedSize i = 0; i < num_points; i++) { - for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset]; + for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] += uu[j * num_points + i + e_vec_offset]; } } e_vec_offset += num_points * (CeedSize)num_comp; @@ -420,8 +420,8 @@ static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rst // Sum into for transpose mode switch (rstr_type) { case CEED_RESTRICTION_STRIDED: - CeedCallBackend( - CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); + 
CeedCallBackend(CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, + uu, vv)); break; case CEED_RESTRICTION_STANDARD: CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, @@ -460,8 +460,8 @@ static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rst // Overwrite for notranspose mode switch (rstr_type) { case CEED_RESTRICTION_STRIDED: - CeedCallBackend( - CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); + CeedCallBackend(CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, + v_offset, uu, vv)); break; case CEED_RESTRICTION_STANDARD: CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, @@ -673,6 +673,21 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m } } + // Expand E-vector size for AtPoints + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedSize max_points = 0, num_points_total = 0; + + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points = offsets[i + 1] - offsets[i]; + + max_points = CeedIntMax(max_points, num_points); + num_points_total += num_points; + } + // -- Increase size for last element + num_points_total += (max_points - (offsets[num_elem] - offsets[num_elem - 1])); + CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, num_points_total * num_comp)); + } + // Offsets data if (rstr_type != CEED_RESTRICTION_STRIDED) { // Check indices @@ -745,14 +760,15 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck)); if (rstr_type == CEED_RESTRICTION_POINTS) { - CeedCallBackend( - CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", + CeedElemRestrictionApplyAtPointsInElement_Memcheck)); } CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c index f23a8013e6..a0140fbd75 100644 --- a/backends/memcheck/ceed-memcheck-serial.c +++ b/backends/memcheck/ceed-memcheck-serial.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -23,6 +23,7 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Memcheck)); diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c index b12b7ead95..c5dd1fe56d 100644 --- a/backends/memcheck/ceed-memcheck-vector.c +++ b/backends/memcheck/ceed-memcheck-vector.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -21,7 +22,7 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra CeedVector_Memcheck *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); - *has_valid_array = impl->array; + *has_valid_array = !!impl->array_allocated; return CEED_ERROR_SUCCESS; } @@ -31,9 +32,10 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra static inline int CeedVectorHasBorrowedArrayOfType_Memcheck(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Memcheck *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); - *has_borrowed_array_of_type = impl->array_borrowed; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + *has_borrowed_array_of_type = !!impl->array_borrowed; return CEED_ERROR_SUCCESS; } @@ -44,39 +46,108 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee CeedSize length; CeedVector_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); - + // Clear previous owned arrays + if (impl->array_allocated) { + for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN; + VALGRIND_DISCARD(impl->allocated_block_id); + } CeedCallBackend(CeedFree(&impl->array_allocated)); - CeedCallBackend(CeedFree(&impl->array_owned)); + if (copy_mode != CEED_COPY_VALUES) { + if (impl->array_owned) { + for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN; + 
VALGRIND_DISCARD(impl->owned_block_id); + } + CeedCallBackend(CeedFree(&impl->array_owned)); + } + + // Clear borrowed block id, if present + if (impl->array_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id); + + // Set internal pointers to external arrays switch (copy_mode) { case CEED_COPY_VALUES: - CeedCallBackend(CeedCalloc(length, &impl->array_owned)); - impl->array_borrowed = NULL; - impl->array = impl->array_owned; - if (array) { - memcpy(impl->array, array, length * sizeof(array[0])); - } else { - for (CeedInt i = 0; i < length; i++) impl->array[i] = NAN; - } + // Nothing to update break; case CEED_OWN_POINTER: impl->array_owned = array; impl->array_borrowed = NULL; - impl->array = array; + impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->array_owned, length * sizeof(CeedScalar), "Owned external array buffer"); break; case CEED_USE_POINTER: - impl->array_borrowed = array; - impl->array = array; + impl->array_owned = NULL; + impl->array_borrowed = array; + impl->borrowed_block_id = VALGRIND_CREATE_BLOCK(impl->array_borrowed, length * sizeof(CeedScalar), "Borrowed external array buffer"); + break; } - // Copy data to check access + + // Create internal array data buffer CeedCallBackend(CeedCalloc(length, &impl->array_allocated)); - memcpy(impl->array_allocated, impl->array, length * sizeof(array[0])); - impl->array = impl->array_allocated; - VALGRIND_DISCARD(impl->mem_block_id); - impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->array, length * sizeof(array[0]), "'Vector backend array data copy'"); + impl->allocated_block_id = VALGRIND_CREATE_BLOCK(impl->array_allocated, length * sizeof(CeedScalar), "Allocated internal array buffer"); + if (array) { + memcpy(impl->array_allocated, array, length * sizeof(CeedScalar)); + } else { + for (CeedInt i = 0; i < length; i++) impl->array_allocated[i] = NAN; + } + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Set internal array to value 
+//------------------------------------------------------------------------------ +static int CeedVectorSetValue_Memcheck(CeedVector vec, CeedScalar value) { + CeedSize length; + CeedVector_Memcheck *impl; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + + if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL)); + assert(impl->array_allocated); + for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = value; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Set internal array to value strided +//------------------------------------------------------------------------------ +static int CeedVectorSetValueStrided_Memcheck(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { + CeedSize length; + CeedVector_Memcheck *impl; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + + if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL)); + assert(impl->array_allocated); + if (stop == -1) stop = length; + for (CeedSize i = start; i < stop; i += step) impl->array_allocated[i] = val; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Sync arrays +//------------------------------------------------------------------------------ +static int CeedVectorSyncArray_Memcheck(const CeedVector vec, CeedMemType mem_type) { + CeedSize length; + CeedVector_Memcheck *impl; + + CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + + // Copy internal buffer back to owned or borrowed array + if 
(impl->array_owned) { + memcpy(impl->array_owned, impl->array_allocated, length * sizeof(CeedScalar)); + } + if (impl->array_borrowed) { + memcpy(impl->array_borrowed, impl->array_allocated, length * sizeof(CeedScalar)); + } return CEED_ERROR_SUCCESS; } @@ -84,16 +155,27 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee // Vector Take Array //------------------------------------------------------------------------------ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + CeedSize length; CeedVector_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // Synchronize memory + CeedCallBackend(CeedVectorSyncArray_Memcheck(vec, CEED_MEM_HOST)); + // Return borrowed array (*array) = impl->array_borrowed; impl->array_borrowed = NULL; - impl->array = NULL; - VALGRIND_DISCARD(impl->mem_block_id); + VALGRIND_DISCARD(impl->borrowed_block_id); + + // De-allocate internal memory + if (impl->array_allocated) { + for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN; + VALGRIND_DISCARD(impl->allocated_block_id); + } CeedCallBackend(CeedFree(&impl->array_allocated)); return CEED_ERROR_SUCCESS; } @@ -102,13 +184,19 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce // Vector Get Array //------------------------------------------------------------------------------ static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + CeedSize length; CeedVector_Memcheck *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedCheck(mem_type == CEED_MEM_HOST, 
CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); - *array = impl->array; + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + + // Create and return writable buffer + CeedCallBackend(CeedCalloc(length, &impl->array_writable_copy)); + impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->array_writable_copy, length * sizeof(CeedScalar), "Allocated writeable array buffer copy"); + memcpy(impl->array_writable_copy, impl->array_allocated, length * sizeof(CeedScalar)); + *array = impl->array_writable_copy; return CEED_ERROR_SUCCESS; } @@ -119,16 +207,18 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type, CeedSize length; CeedVector_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, (CeedScalar **)array)); - - // Make copy to verify no write occurred + // Create and return read-only buffer if (!impl->array_read_only_copy) { CeedCallBackend(CeedCalloc(length, &impl->array_read_only_copy)); - memcpy(impl->array_read_only_copy, *array, length * sizeof((*array)[0])); + impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->array_read_only_copy, length * sizeof(CeedScalar), "Allocated read-only array buffer copy"); + memcpy(impl->array_read_only_copy, impl->array_allocated, length * sizeof(CeedScalar)); } + *array = impl->array_read_only_copy; return CEED_ERROR_SUCCESS; } @@ -139,12 +229,18 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type CeedSize length; CeedVector_Memcheck *impl; + CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + 
CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - // Invalidate data to make sure no read occurs - if (!impl->array) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, mem_type, CEED_COPY_VALUES, NULL)); + // Allocate buffer if necessary + if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, mem_type, CEED_COPY_VALUES, NULL)); + + // Get writable buffer CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, array)); + + // Invalidate array data to prevent accidental reads for (CeedSize i = 0; i < length; i++) (*array)[i] = NAN; impl->is_write_only_access = true; return CEED_ERROR_SUCCESS; @@ -154,27 +250,31 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type // Vector Restore Array //------------------------------------------------------------------------------ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) { - Ceed ceed; CeedSize length; CeedVector_Memcheck *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + // Check for unset entries after write-only access if (impl->is_write_only_access) { for (CeedSize i = 0; i < length; i++) { - if (isnan(impl->array[i])) - CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i); + if (isnan(impl->array_writable_copy[i])) { + CeedDebug256(CeedVectorReturnCeed(vec), CEED_DEBUG_COLOR_WARNING, + "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i); + } } impl->is_write_only_access = false; } - if (impl->array_borrowed) { - memcpy(impl->array_borrowed, impl->array, length * sizeof(impl->array[0])); - } - if (impl->array_owned) { - memcpy(impl->array_owned, impl->array, length * sizeof(impl->array[0])); - } + + // Copy back to internal buffer and sync + memcpy(impl->array_allocated, 
impl->array_writable_copy, length * sizeof(CeedScalar)); + CeedCallBackend(CeedVectorSyncArray_Memcheck(vec, CEED_MEM_HOST)); + + // Invalidate writable buffer + for (CeedSize i = 0; i < length; i++) impl->array_writable_copy[i] = NAN; + CeedCallBackend(CeedFree(&impl->array_writable_copy)); + VALGRIND_DISCARD(impl->writable_block_id); return CEED_ERROR_SUCCESS; } @@ -188,10 +288,93 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) { CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCheck(!memcmp(impl->array, impl->array_read_only_copy, length * sizeof(impl->array[0])), CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, - "Array data changed while accessed in read-only mode"); + // Verify no changes made during read-only access + bool is_changed = memcmp(impl->array_allocated, impl->array_read_only_copy, length * sizeof(CeedScalar)); + CeedCheck(!is_changed, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Array data changed while accessed in read-only mode"); + + // Invalidate read-only buffer + for (CeedSize i = 0; i < length; i++) impl->array_read_only_copy[i] = NAN; CeedCallBackend(CeedFree(&impl->array_read_only_copy)); + VALGRIND_DISCARD(impl->read_only_block_id); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Take reciprocal of a vector +//------------------------------------------------------------------------------ +static int CeedVectorReciprocal_Memcheck(CeedVector vec) { + CeedSize length; + CeedVector_Memcheck *impl; + + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + + for (CeedSize i = 0; i < length; i++) { + if (fabs(impl->array_allocated[i]) > CEED_EPSILON) impl->array_allocated[i] = 1. 
/ impl->array_allocated[i]; + } + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Compute x = alpha x +//------------------------------------------------------------------------------ +static int CeedVectorScale_Memcheck(CeedVector x, CeedScalar alpha) { + CeedSize length; + CeedVector_Memcheck *impl; + + CeedCallBackend(CeedVectorGetData(x, &impl)); + CeedCallBackend(CeedVectorGetLength(x, &length)); + + for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] *= alpha; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Compute y = alpha x + y +//------------------------------------------------------------------------------ +static int CeedVectorAXPY_Memcheck(CeedVector y, CeedScalar alpha, CeedVector x) { + CeedSize length; + CeedVector_Memcheck *impl_x, *impl_y; + + CeedCallBackend(CeedVectorGetData(x, &impl_x)); + CeedCallBackend(CeedVectorGetData(y, &impl_y)); + CeedCallBackend(CeedVectorGetLength(y, &length)); + + for (CeedSize i = 0; i < length; i++) impl_y->array_allocated[i] += alpha * impl_x->array_allocated[i]; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Compute y = alpha x + beta y +//------------------------------------------------------------------------------ +static int CeedVectorAXPBY_Memcheck(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) { + CeedSize length; + CeedVector_Memcheck *impl_x, *impl_y; + + CeedCallBackend(CeedVectorGetData(x, &impl_x)); + CeedCallBackend(CeedVectorGetData(y, &impl_y)); + CeedCallBackend(CeedVectorGetLength(y, &length)); + + for (CeedSize i = 0; i < length; i++) impl_y->array_allocated[i] = alpha * impl_x->array_allocated[i] + beta * impl_y->array_allocated[i]; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Compute 
the pointwise multiplication w = x .* y +//------------------------------------------------------------------------------ +static int CeedVectorPointwiseMult_Memcheck(CeedVector w, CeedVector x, CeedVector y) { + CeedSize length; + CeedVector_Memcheck *impl_x, *impl_y, *impl_w; + + CeedCallBackend(CeedVectorGetData(x, &impl_x)); + CeedCallBackend(CeedVectorGetData(y, &impl_y)); + CeedCallBackend(CeedVectorGetData(w, &impl_w)); + CeedCallBackend(CeedVectorGetLength(w, &length)); + + if (!impl_w->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(w, CEED_MEM_HOST, CEED_COPY_VALUES, NULL)); + assert(impl_w->array_allocated); + for (CeedSize i = 0; i < length; i++) impl_w->array_allocated[i] = impl_x->array_allocated[i] * impl_y->array_allocated[i]; return CEED_ERROR_SUCCESS; } @@ -201,10 +384,19 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) { static int CeedVectorDestroy_Memcheck(CeedVector vec) { CeedVector_Memcheck *impl; + // Free allocations and discard block ids CeedCallBackend(CeedVectorGetData(vec, &impl)); - VALGRIND_DISCARD(impl->mem_block_id); - CeedCallBackend(CeedFree(&impl->array_allocated)); - CeedCallBackend(CeedFree(&impl->array_owned)); + if (impl->array_allocated) { + CeedCallBackend(CeedFree(&impl->array_allocated)); + VALGRIND_DISCARD(impl->allocated_block_id); + } + if (impl->array_owned) { + CeedCallBackend(CeedFree(&impl->array_owned)); + VALGRIND_DISCARD(impl->owned_block_id); + } + if (impl->array_borrowed) { + VALGRIND_DISCARD(impl->borrowed_block_id); + } CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -216,20 +408,28 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) { Ceed ceed; CeedVector_Memcheck *impl; - CeedCallBackend(CeedCalloc(1, &impl)); - CeedCallBackend(CeedVectorSetData(vec, impl)); - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Memcheck)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Memcheck)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedVectorSetData(vec, impl)); return 
CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h index 603597fb1e..49f14e0270 100644 --- a/backends/memcheck/ceed-memcheck.h +++ b/backends/memcheck/ceed-memcheck.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -10,13 +10,22 @@ #include typedef struct { - int mem_block_id; - bool is_write_only_access; - CeedScalar *array; + // Internal array buffer + int allocated_block_id; CeedScalar *array_allocated; + // Owned external array + int owned_block_id; CeedScalar *array_owned; + // Borrowed external array + int borrowed_block_id; CeedScalar *array_borrowed; + // Externally viewable read-only array + int read_only_block_id; CeedScalar *array_read_only_copy; + // Externally viewable writable array + bool is_write_only_access; + int writable_block_id; + CeedScalar *array_writable_copy; } CeedVector_Memcheck; typedef struct { @@ -31,18 +40,27 @@ typedef struct { } CeedElemRestriction_Memcheck; typedef struct { + bool setup_done; const CeedScalar **inputs; CeedScalar **outputs; - bool setup_done; } CeedQFunction_Memcheck; typedef struct { - int mem_block_id; - void *data; + // Internal data buffer + int allocated_block_id; void *data_allocated; + // Owned external data + int owned_block_id; void *data_owned; + // Borrowed external data + int borrowed_block_id; void *data_borrowed; + // Externally viewable read-only data + int read_only_block_id; void *data_read_only_copy; + // Externally viewable writable data + int writable_block_id; + void *data_writable_copy; } CeedQFunctionContext_Memcheck; CEED_INTERN int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec); diff --git a/backends/occa/ceed-occa-basis.cpp 
b/backends/occa/ceed-occa-basis.cpp deleted file mode 100644 index 0c33da5453..0000000000 --- a/backends/occa/ceed-occa-basis.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-basis.hpp" - -#include "ceed-occa-tensor-basis.hpp" - -namespace ceed { -namespace occa { -Basis::Basis() : ceedComponentCount(0), dim(0), P(0), Q(0) {} - -Basis::~Basis() {} - -Basis *Basis::getBasis(CeedBasis basis, const bool assertValid) { - if (!basis) { - return NULL; - } - - int ierr; - Basis *basis_ = NULL; - - ierr = CeedBasisGetData(basis, &basis_); - if (assertValid) { - CeedOccaFromChk(ierr); - } - - return basis_; -} - -Basis *Basis::from(CeedBasis basis) { - Basis *basis_ = getBasis(basis); - if (!basis_) { - return NULL; - } - - CeedCallOcca(basis_->setCeedFields(basis)); - - return basis_; -} - -Basis *Basis::from(CeedOperatorField operatorField) { - CeedBasis basis; - CeedCallOcca(CeedOperatorFieldGetBasis(operatorField, &basis)); - return from(basis); -} - -int Basis::setCeedFields(CeedBasis basis) { - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - CeedCallBackend(CeedBasisGetNumComponents(basis, &ceedComponentCount)); - - return CEED_ERROR_SUCCESS; -} - -//---[ Ceed Callbacks ]----------- -int Basis::registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Basis", basis, fname, f); -} - -int Basis::ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v) { - Basis *basis_ = Basis::from(basis); - Vector *U = Vector::from(u); - Vector *V = Vector::from(v); - - if (!basis_) { - return staticCeedError("Incorrect CeedBasis argument: op"); - } - - 
return basis_->apply(nelem, tmode, emode, U, V); -} - -int Basis::ceedDestroy(CeedBasis basis) { - delete getBasis(basis, false); - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-basis.hpp b/backends/occa/ceed-occa-basis.hpp deleted file mode 100644 index 2fe01ec052..0000000000 --- a/backends/occa/ceed-occa-basis.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_BASIS_HEADER -#define CEED_OCCA_BASIS_HEADER - -#include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -class Basis : public CeedObject { - public: - // Ceed object information - CeedInt ceedComponentCount; - - // Owned information - CeedInt dim; - CeedInt P; - CeedInt Q; - - Basis(); - - virtual ~Basis(); - - static Basis *getBasis(CeedBasis basis, const bool assertValid = true); - - static Basis *from(CeedBasis basis); - static Basis *from(CeedOperatorField operatorField); - - int setCeedFields(CeedBasis basis); - - virtual bool isTensorBasis() const = 0; - - virtual const char *getFunctionSource() const = 0; - - virtual int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v) = 0; - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f); - - static int ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v); - - static int ceedDestroy(CeedBasis basis); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-ceed-object.cpp b/backends/occa/ceed-occa-ceed-object.cpp 
deleted file mode 100644 index c6dd28fef2..0000000000 --- a/backends/occa/ceed-occa-ceed-object.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-ceed-object.hpp" - -#include "ceed-occa-context.hpp" - -namespace ceed { -namespace occa { -CeedObject::CeedObject(Ceed ceed_) : ceed(ceed_) {} - -::occa::device CeedObject::getDevice() { - if (!_device.isInitialized()) { - _device = Context::from(ceed)->device; - } - return _device; -} - -bool CeedObject::usingCpuDevice() const { return Context::from(ceed)->usingCpuDevice(); } - -bool CeedObject::usingGpuDevice() const { return Context::from(ceed)->usingGpuDevice(); } - -int CeedObject::ceedError(const std::string &message) const { return CeedError(ceed, CEED_ERROR_BACKEND, message.c_str()); } - -int CeedObject::staticCeedError(const std::string &message) { return CeedError(NULL, CEED_ERROR_BACKEND, message.c_str()); } -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-ceed-object.hpp b/backends/occa/ceed-occa-ceed-object.hpp deleted file mode 100644 index 46235cbad5..0000000000 --- a/backends/occa/ceed-occa-ceed-object.hpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_CEEDOBJECT_HEADER -#define CEED_OCCA_CEEDOBJECT_HEADER - -#include "ceed-occa-context.hpp" - -namespace ceed { -namespace occa { -class CeedObject { - private: - ::occa::device _device; - - public: - Ceed ceed; - - CeedObject(Ceed ceed_ = NULL); - - ::occa::device getDevice(); - - bool usingCpuDevice() const; - bool usingGpuDevice() const; - - int ceedError(const std::string &message) const; - static int staticCeedError(const std::string &message); -}; - -namespace SyncState { -static const int none = 0; -static const int host = (1 << 0); -static const int device = (1 << 1); -static const int all = host | device; -} // namespace SyncState -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-context.cpp b/backends/occa/ceed-occa-context.cpp deleted file mode 100644 index 4a705147de..0000000000 --- a/backends/occa/ceed-occa-context.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-context.hpp" - -namespace ceed { -namespace occa { -Context::Context(::occa::device device_) : device(device_) { - const std::string mode = device.mode(); - _usingCpuDevice = (mode == "Serial" || mode == "OpenMP"); - _usingGpuDevice = (mode == "CUDA" || mode == "HIP" || mode == "OpenCL"); -} - -Context *Context::from(Ceed ceed) { - if (!ceed) { - return NULL; - } - - Context *context; - CeedGetData(ceed, (void **)&context); - return context; -} - -bool Context::usingCpuDevice() const { return _usingCpuDevice; } - -bool Context::usingGpuDevice() const { return _usingGpuDevice; } -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-context.hpp b/backends/occa/ceed-occa-context.hpp deleted file mode 100644 index 3e1586082b..0000000000 --- a/backends/occa/ceed-occa-context.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_CONTEXT_HEADER -#define CEED_OCCA_CONTEXT_HEADER - -#include "ceed-occa-types.hpp" - -namespace ceed { -namespace occa { -class Context { - private: - bool _usingCpuDevice; - bool _usingGpuDevice; - - public: - ::occa::device device; - - Context(::occa::device device_); - - static Context *from(Ceed ceed); - - bool usingCpuDevice() const; - bool usingGpuDevice() const; -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-cpu-operator.cpp b/backends/occa/ceed-occa-cpu-operator.cpp deleted file mode 100644 index cf5bd3fe59..0000000000 --- a/backends/occa/ceed-occa-cpu-operator.cpp +++ /dev/null @@ -1,751 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-cpu-operator.hpp" - -#include "ceed-occa-elem-restriction.hpp" -#include "ceed-occa-qfunction.hpp" -#include "ceed-occa-qfunctioncontext.hpp" -#include "ceed-occa-simplex-basis.hpp" -#include "ceed-occa-tensor-basis.hpp" - -#define CEED_OCCA_PRINT_KERNEL_HASHES 0 - -namespace ceed { -namespace occa { -CpuOperator::CpuOperator() {} - -CpuOperator::~CpuOperator() {} - -void CpuOperator::setupVectors() { - setupVectors(args.inputCount(), args.opInputs, args.qfInputs, dofInputs); - setupVectors(args.outputCount(), args.opOutputs, args.qfOutputs, dofOutputs); -} - -void CpuOperator::setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors) { - for (int i = 0; i < fieldCount; ++i) { - const QFunctionField &qfField = qfFields[i]; - const OperatorField &opField = opFields[i]; - - if (qfField.evalMode == CEED_EVAL_WEIGHT) { - // Weight kernel 
doesn't use the input - vectors.push_back(NULL); - continue; - } - - int entries; - if (qfField.evalMode == CEED_EVAL_NONE) { - // The output vector stores values at quadrature points - entries = (ceedElementCount * ceedQ * qfField.size); - } else { - // The output vector stores the element dof values - entries = (ceedElementCount * opField.getElementSize() * opField.getComponentCount()); - } - - Vector *dofVector = new Vector(); - dofVector->ceed = ceed; - dofVector->resize(entries); - - vectors.push_back(dofVector); - } -} - -void CpuOperator::freeVectors() { - for (int i = 0; i < args.inputCount(); ++i) { - delete dofInputs[i]; - } - for (int i = 0; i < args.outputCount(); ++i) { - delete dofOutputs[i]; - } - dofInputs.clear(); - dofOutputs.clear(); -} - -void CpuOperator::setupInputs(Vector *in) { - for (int i = 0; i < args.inputCount(); ++i) { - // Weight kernel doesn't use the input vector - if (args.getInputEvalMode(i) == CEED_EVAL_WEIGHT) { - continue; - } - - const OperatorField &opField = args.getOpInput(i); - - Vector *input = opField.usesActiveVector() ? in : opField.vec; - Vector *output = dofInputs[i]; - - opField.elemRestriction->apply(CEED_NOTRANSPOSE, *input, *output); - } -} - -void CpuOperator::setupOutputs(Vector *out) { - for (int i = 0; i < args.outputCount(); ++i) { - // Weight is not supported for output vectors - if (args.getOutputEvalMode(i) == CEED_EVAL_WEIGHT) { - continue; - } - - const OperatorField &opField = args.getOpOutput(i); - - Vector *input = dofOutputs[i]; - Vector *output = opField.usesActiveVector() ? 
out : opField.vec; - - opField.elemRestriction->apply(CEED_TRANSPOSE, *input, *output); - } -} - -void CpuOperator::applyQFunction() { - if (qfunction->qFunctionContext) { - QFunctionContext *ctx = QFunctionContext::from(qfunction->qFunctionContext); - applyAddKernel.pushArg(ctx->getKernelArg()); - } else { - applyAddKernel.pushArg(::occa::null); - } - applyAddKernel.pushArg(ceedElementCount); - - for (int i = 0; i < args.inputCount(); ++i) { - const bool isInput = true; - pushKernelArgs(dofInputs[i], isInput, i); - } - - for (int i = 0; i < args.outputCount(); ++i) { - const bool isInput = false; - pushKernelArgs(dofOutputs[i], isInput, i); - } - - applyAddKernel.run(); -} - -void CpuOperator::pushKernelArgs(Vector *vec, const bool isInput, const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const QFunctionField &qfField = args.getQfField(isInput, index); - - if (opField.hasBasis()) { - if (opField.usingTensorBasis()) { - pushTensorBasisKernelArgs(qfField, *((TensorBasis *)opField.basis)); - } else { - pushSimplexBasisKernelArgs(qfField, *((SimplexBasis *)opField.basis)); - } - } - - if (vec) { - if (isInput) { - applyAddKernel.pushArg(vec->getConstKernelArg()); - } else { - applyAddKernel.pushArg(vec->getKernelArg()); - } - } else { - applyAddKernel.pushArg(::occa::null); - } -} - -void CpuOperator::pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis) { - switch (qfField.evalMode) { - case CEED_EVAL_INTERP: { - applyAddKernel.pushArg(basis.interp1D); - break; - } - case CEED_EVAL_GRAD: { - applyAddKernel.pushArg(basis.interp1D); - applyAddKernel.pushArg(basis.grad1D); - break; - } - case CEED_EVAL_WEIGHT: { - applyAddKernel.pushArg(basis.qWeight1D); - break; - } - default: { - } - } -} - -void CpuOperator::pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis) { - switch (qfField.evalMode) { - case CEED_EVAL_INTERP: { - applyAddKernel.pushArg(basis.interp); - break; - } - case 
CEED_EVAL_GRAD: { - applyAddKernel.pushArg(basis.grad); - break; - } - case CEED_EVAL_WEIGHT: { - applyAddKernel.pushArg(basis.qWeight); - break; - } - default: { - } - } -} - -::occa::properties CpuOperator::getKernelProps() { - ::occa::properties props = qfunction->getKernelProps(ceedQ); - - props["defines/OCCA_Q"] = ceedQ; - - return props; -} - -void CpuOperator::applyAdd(Vector *in, Vector *out) { - // Setup helper vectors - setupVectors(); - - // Dof nodes -> local dofs - setupInputs(in); - - // Apply qFunction - applyQFunction(); - - // Local dofs -> dof nodes - setupOutputs(out); - - // Cleanup helper vectors - freeVectors(); -} - -::occa::kernel CpuOperator::buildApplyAddKernel() { - std::stringstream ss; - - addBasisFunctionSource(ss); - - addKernelSource(ss); - - const std::string kernelSource = ss.str(); - - CeedDebug(ceed, kernelSource.c_str()); - - // TODO: Store a kernel per Q - return getDevice().buildKernelFromString(kernelSource, "applyAdd", getKernelProps()); -} - -//---[ Kernel Generation ]-------------------- -void CpuOperator::addBasisFunctionSource(std::stringstream &ss) { - BasisVector sourceBasis; - for (int i = 0; i < args.inputCount(); ++i) { - addBasisIfMissingSource(sourceBasis, args.getOpInput(i).basis); - } - for (int i = 0; i < args.outputCount(); ++i) { - addBasisIfMissingSource(sourceBasis, args.getOpOutput(i).basis); - } - - // Make sure there's a break between past code - ss << std::endl; - - // Add source code for each unique basis function - const int basisCount = (int)sourceBasis.size(); - for (int i = 0; i < basisCount; ++i) { - Basis &basis = *(sourceBasis[i]); - - ss << "// Code generation for basis " << i + 1 << std::endl << "//---[ START ]-------------------------------" << std::endl; - - // Undefine and redefine required variables - if (basis.isTensorBasis()) { - TensorBasis &basisTensor = (TensorBasis &)basis; - ss << "#undef TENSOR_FUNCTION" << std::endl - << "#undef P1D" << std::endl - << "#undef Q1D" << std::endl - 
<< "#define P1D " << basisTensor.P1D << std::endl - << "#define Q1D " << basisTensor.Q1D << std::endl; - } else { - SimplexBasis &basisSimplex = (SimplexBasis &)basis; - ss << "#undef SIMPLEX_FUNCTION" << std::endl - << "#undef DIM" << std::endl - << "#undef P" << std::endl - << "#undef Q" << std::endl - << "#define DIM " << basisSimplex.dim << std::endl - << "#define P " << basisSimplex.P << std::endl - << "#define Q " << basisSimplex.Q << std::endl; - } - - ss << std::endl << basis.getFunctionSource() << std::endl << "//---[ END ]---------------------------------" << std::endl; - } -} - -void CpuOperator::addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis) { - // Avoid adding duplicate sources which will result in colliding symbol names - - // No basis - if (!basis) { - return; - } - - // Fast enough since we expect a small number of inputs/outputs - const int existingBasisCount = (int)sourceBasis.size(); - for (int i = 0; i < existingBasisCount; ++i) { - Basis *other = sourceBasis[i]; - // They are different basis types so other != basis - if (basis->isTensorBasis() != other->isTensorBasis()) { - continue; - } - - if (basis->dim == other->dim && basis->P == other->P && basis->Q == other->Q) { - // `other` wil generate the same code - return; - } - } - - // Basis didn't match any other existing basis - sourceBasis.push_back(basis); -} - -void CpuOperator::addKernelSource(std::stringstream &ss) { - // Make sure there's a break between past code - ss << std::endl; - - ss << "@kernel void applyAdd(" << std::endl; - - addKernelArgsSource(ss); - - ss << std::endl - << ") {" << std::endl - << " @tile(128, @outer, @inner)" << std::endl - << " for (int element = 0; element < elementCount; ++element) {" << std::endl; - -#if CEED_OCCA_PRINT_KERNEL_HASHES - // Print to see which kernel is being run - ss << " if (element == 0) {" << std::endl - << " printf(\"\\n\\nOperator Kernel: \" OKL_KERNEL_HASH \"\\n\\n\");" << std::endl - << " }" << std::endl; -#endif - - 
addQuadArraySource(ss); - - ss << std::endl << " // [Start] Transforming inputs to quadrature points" << std::endl; - addInputSetupSource(ss); - ss << " // [End] Transforming inputs to quadrature points" << std::endl << std::endl; - - addQFunctionApplicationSource(ss); - - ss << std::endl << " // [Start] Transforming outputs to quadrature points" << std::endl; - addOutputSetupSource(ss); - ss << " // [End] Transforming outputs to quadrature points" << std::endl; - - ss << " }" << std::endl << "}" << std::endl; -} - -void CpuOperator::addKernelArgsSource(std::stringstream &ss) { - ss << " void *ctx," << std::endl << " const CeedInt elementCount"; - - for (int i = 0; i < args.inputCount(); ++i) { - const bool isInput = true; - addKernelArgSource(ss, isInput, i); - } - for (int i = 0; i < args.outputCount(); ++i) { - const bool isInput = false; - addKernelArgSource(ss, isInput, i); - } -} - -void CpuOperator::addKernelArgSource(std::stringstream &ss, const bool isInput, const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const QFunctionField &qfField = args.getQfField(isInput, index); - - std::stringstream dimAttribute; - if (opField.hasBasis()) { - ss << ',' << std::endl; - if (opField.usingTensorBasis()) { - addTensorKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute); - } else { - addSimplexKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute); - } - } - - ss << ',' << std::endl; - if (isInput) { - ss << " const CeedScalar *" << dofInputVar(index) << dimAttribute.str(); - } else { - ss << " CeedScalar *" << dofOutputVar(index) << dimAttribute.str(); - } -} - -void CpuOperator::addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField, - const QFunctionField &qfField, std::stringstream &dimAttribute) { - TensorBasis &basis = *((TensorBasis *)opField.basis); - - dimAttribute << " @dim("; - - if (qfField.evalMode == CEED_EVAL_INTERP) { - ss << " 
const CeedScalar *" << interpVar(isInput, index); - - // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) - for (int i = 0; i < basis.dim; ++i) { - dimAttribute << basis.P1D << ", "; - } - dimAttribute << basis.ceedComponentCount << ", elementCount"; - } else if (qfField.evalMode == CEED_EVAL_GRAD) { - ss << " const CeedScalar *" << interpVar(isInput, index) << ',' << std::endl << " const CeedScalar *" << gradVar(isInput, index); - - // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) - for (int i = 0; i < basis.dim; ++i) { - dimAttribute << basis.P1D << ", "; - } - dimAttribute << basis.ceedComponentCount << ", elementCount"; - } else if (qfField.evalMode == CEED_EVAL_WEIGHT) { - ss << " const CeedScalar *" << qWeightVar(isInput, index); - - // @dim(Q1D, Q1D, elementCount) - for (int i = 0; i < basis.dim; ++i) { - dimAttribute << basis.Q1D << ", "; - } - dimAttribute << "elementCount"; - } else { - // Clear @dim - dimAttribute.str(""); - return; - } - - dimAttribute << ")"; -} - -void CpuOperator::addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField, - const QFunctionField &qfField, std::stringstream &dimAttribute) { - SimplexBasis &basis = *((SimplexBasis *)opField.basis); - - dimAttribute << " @dim("; - - if (qfField.evalMode == CEED_EVAL_INTERP) { - ss << " const CeedScalar *" << interpVar(isInput, index); - - // @dim(P, BASIS_COMPONENT_COUNT, elementCount) - dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount"; - } else if (qfField.evalMode == CEED_EVAL_GRAD) { - ss << " const CeedScalar *" << gradVar(isInput, index); - - // @dim(P, BASIS_COMPONENT_COUNT, elementCount) - dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount"; - } else if (qfField.evalMode == CEED_EVAL_WEIGHT) { - ss << " const CeedScalar *" << qWeightVar(isInput, index); - - // @dim(Q, elementCount) - dimAttribute << basis.Q << ", " - << "elementCount"; - } else { - // Clear 
@dim - dimAttribute.str(""); - return; - } - - dimAttribute << ")"; -} - -void CpuOperator::addQuadArraySource(std::stringstream &ss) { - const int inputs = args.inputCount(); - const int outputs = args.outputCount(); - - const std::string quadInput = "quadInput"; - const std::string quadOutput = "quadOutput"; - - ss << " // Store the transformed input quad values" << std::endl; - for (int i = 0; i < inputs; ++i) { - const bool isInput = true; - addSingleQfunctionQuadArraySource(ss, isInput, i, quadInput); - } - - ss << std::endl << " // Store the transformed output quad values" << std::endl; - for (int i = 0; i < outputs; ++i) { - const bool isInput = false; - addSingleQfunctionQuadArraySource(ss, isInput, i, quadOutput); - } - ss << std::endl; - - ss << std::endl << " // Store all input pointers in a single array" << std::endl; - addQfunctionQuadArraySource(ss, true, inputs, quadInput); - - ss << std::endl << " // Store all output pointers in a single array" << std::endl; - addQfunctionQuadArraySource(ss, false, outputs, quadOutput); - - ss << std::endl; -} - -void CpuOperator::addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name) { - // Output: - // CeedScalar quadInput0[DIM][COMPONENTS][OCCA_Q]; - // CeedScalar quadInput0[OCCA_Q * SIZE]; - - const OperatorField &opField = args.getOpField(isInput, index); - CeedEvalMode evalMode = args.getEvalMode(isInput, index); - - if (evalMode == CEED_EVAL_GRAD) { - ss << " CeedScalar " << indexedVar(name, index) << "[" << opField.getDim() << "]" - << "[" << opField.getComponentCount() << "]" - << "[OCCA_Q];" << std::endl; - } else if (evalMode == CEED_EVAL_INTERP) { - ss << " CeedScalar " << indexedVar(name, index) << "[" << opField.getComponentCount() << "]" - << "[OCCA_Q];" << std::endl; - } else { - const QFunctionField &qfField = args.getQfField(isInput, index); - - ss << " CeedScalar " << indexedVar(name, index) << "[OCCA_Q * " << qfField.size << "];" << 
std::endl; - } -} - -void CpuOperator::addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name) { - // Output: - // CeedScalar *quadInputs[2] = { - // (CeedScalar*) quadInput0, - // (CeedScalar*) quadInput1 - // }; - - // Add an 's': quadInput -> quadInputs - const std::string arrayName = name + "s"; - - ss << " CeedScalar *" << arrayName << "[" << count << "] = {" << std::endl; - for (int i = 0; i < count; ++i) { - if (i) { - ss << ',' << std::endl; - } - ss << " (CeedScalar*) " << indexedVar(name, i); - } - ss << std::endl << " };" << std::endl; -} - -void CpuOperator::addInputSetupSource(std::stringstream &ss) { - const bool isInput = true; - addBasisApplySource(ss, isInput, args.inputCount()); -} - -void CpuOperator::addOutputSetupSource(std::stringstream &ss) { - const bool isInput = false; - addBasisApplySource(ss, isInput, args.outputCount()); -} - -void CpuOperator::addBasisApplySource(std::stringstream &ss, const bool isInput, const int count) { - for (int i = 0; i < count; ++i) { - CeedEvalMode evalMode = args.getEvalMode(isInput, i); - - if (evalMode == CEED_EVAL_INTERP) { - addInterpSource(ss, isInput, i); - } else if (evalMode == CEED_EVAL_GRAD) { - const bool hasTensorBasis = args.getOpField(isInput, i).usingTensorBasis(); - if (hasTensorBasis) { - addGradTensorSource(ss, isInput, i); - } else { - addGradSimplexSource(ss, isInput, i); - } - } else if (evalMode == CEED_EVAL_WEIGHT) { - addWeightSource(ss, isInput, i); - } else if (evalMode == CEED_EVAL_NONE) { - addCopySource(ss, isInput, i); - } - } -} - -void CpuOperator::addInterpSource(std::stringstream &ss, const bool isInput, const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const bool usingTensorBasis = opField.usingTensorBasis(); - const int components = opField.getComponentCount(); - const int dim = opField.getDim(); - - const std::string weights = interpVar(isInput, index); - - std::string dimArgs; - 
if (usingTensorBasis) { - for (int i = 0; i < dim; ++i) { - if (i) { - dimArgs += ", "; - } - dimArgs += '0'; - } - } else { - dimArgs = "0"; - } - - std::string input, output; - if (isInput) { - input = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)"; - output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]"; - } else { - input = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]"; - output = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)"; - } - - ss << " // Applying interp (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int component = 0; component < " << components << "; ++component) {" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << weights << ',' << std::endl - << " " << input << ',' << std::endl - << " " << output << std::endl - << " );" << std::endl - << " }" << std::endl - << std::endl; -} - -void CpuOperator::addGradTensorSource(std::stringstream &ss, const bool isInput, const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const int components = opField.getComponentCount(); - const int dim = opField.getDim(); - - const std::string B = interpVar(isInput, index); - const std::string Bx = gradVar(isInput, index); - - std::string dimArgs; - for (int i = 0; i < dim; ++i) { - if (i) { - dimArgs += ", "; - } - dimArgs += '0'; - } - - std::string inputs, outputs; - if (isInput) { - inputs = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)"; - - for (int i = 0; i < dim; ++i) { - if (i) { - outputs += ",\n "; - } - const std::string iStr = std::to_string(i); - outputs += "(CeedScalar*) " + indexedVar("quadInput", index) + "[" + iStr + "][component]"; - } - } else { - for (int i = 0; i < dim; ++i) { - if (i) { - inputs += ",\n "; - } - const std::string iStr = std::to_string(i); - inputs += "(CeedScalar*) " + indexedVar("quadOutput", index) + "[" + iStr + "][component]"; - } - - 
outputs = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)"; - } - - ss << " // Applying grad-tensor (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int component = 0; component < " << components << "; ++component) {" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << B << ',' << std::endl - << " " << Bx << ',' << std::endl - << " " << inputs << ',' << std::endl - << " " << outputs << std::endl - << " );" << std::endl - << " }" << std::endl - << std::endl; -} - -void CpuOperator::addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index) { - const int components = (args.getOpField(isInput, index).getComponentCount()); - - const std::string weights = gradVar(isInput, index); - - std::string input, output; - if (isInput) { - input = "&" + dofInputVar(index) + "(0, component, element)"; - output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]"; - } else { - input = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]"; - output = "&" + dofOutputVar(index) + "(0, component, element)"; - } - - ss << " // Applying grad-simplex (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int component = 0; component < " << components << "; ++component) {" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << weights << ',' << std::endl - << " " << input << ',' << std::endl - << " " << output << std::endl - << " );" << std::endl - << " }" << std::endl - << std::endl; -} - -void CpuOperator::addWeightSource(std::stringstream &ss, const bool isInput, const int index) { - const std::string weights = qWeightVar(isInput, index); - - std::string output; - if (isInput) { - // TODO: Can the weight operator handle multiple components? 
- output = "(CeedScalar*) " + indexedVar("quadInput", index); - } else { - output = "&" + dofOutputVar(index) + "(0, element)"; - } - - ss << " // Applying weight (" << xputName(isInput) << ": " << index << ")" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << weights << ',' << std::endl - << " " << output << std::endl - << " );" << std::endl - << std::endl; -} - -void CpuOperator::addCopySource(std::stringstream &ss, const bool isInput, const int index) { - const QFunctionField &qfField = args.getQfField(isInput, index); - const std::string size = std::to_string(qfField.size); - - std::string input, output; - if (isInput) { - input += dofInputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]"; - output += indexedVar("quadInput", index) + "[q + field * OCCA_Q]"; - } else { - input = indexedVar("quadOutput", index) + "[q + field * OCCA_Q]"; - output = dofOutputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]"; - } - - ss << " // Copying source directly (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int field = 0; field < " << size << "; ++field) {" << std::endl - << " for (int q = 0; q < OCCA_Q; ++q) {" << std::endl - << " " << output << " = " << input << ";" << std::endl - << " }" << std::endl - << " }" << std::endl - << std::endl; -} - -void CpuOperator::addQFunctionApplicationSource(std::stringstream &ss) { - ss << " // Apply qFunction" << std::endl - << " " << qfunction->qFunctionName << "(ctx, OCCA_Q, quadInputs, quadOutputs);" << std::endl - << std::endl; -} - -// ---[ Variables ]----------------- -std::string CpuOperator::elementFunction(const bool isInput, const int index) { - return fullFieldFunctionName(isInput, args.getOpField(isInput, index), args.getQfField(isInput, index)); -} - -std::string CpuOperator::fieldFunctionName(const QFunctionField &qfField) { - switch (qfField.evalMode) { - case CEED_EVAL_INTERP: - return "interp"; - case CEED_EVAL_GRAD: - return 
"grad"; - case CEED_EVAL_WEIGHT: - return "weight"; - default: - return "none"; - } -} - -std::string CpuOperator::fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField) { - // Output: - // - tensor_1d_interpElement_Q2_P2 - // - simplex_1d_interpElementTranspose_Q2_P2 - - const bool usingTensorBasis = opField.usingTensorBasis(); - std::stringstream ss; - int dim, Q, P; - - if (usingTensorBasis) { - TensorBasis &basis = *((TensorBasis *)opField.basis); - dim = basis.dim; - Q = basis.Q1D; - P = basis.P1D; - ss << "tensor_"; - } else { - SimplexBasis &basis = *((SimplexBasis *)opField.basis); - dim = basis.dim; - Q = basis.Q; - P = basis.P; - ss << "simplex_"; - } - - ss << dim << "d_" << fieldFunctionName(qfField) << "Element"; - - if (!isInput) { - ss << "Transpose"; - } - - ss << "_Q" << Q << "_P" << P; - - return ss.str(); -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-cpu-operator.hpp b/backends/occa/ceed-occa-cpu-operator.hpp deleted file mode 100644 index e7e79b059c..0000000000 --- a/backends/occa/ceed-occa-cpu-operator.hpp +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_CPU_OPERATOR_HEADER -#define CEED_OCCA_CPU_OPERATOR_HEADER - -#include -#include - -#include "ceed-occa-operator.hpp" -#include "ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -class Basis; -class SimplexBasis; -class TensorBasis; - -class CpuOperator : public Operator { - private: - typedef std::vector VectorVector; - typedef std::vector BasisVector; - - VectorVector dofInputs, dofOutputs; - - public: - CpuOperator(); - - ~CpuOperator(); - - // Setup helper vectors - void setupVectors(); - - void setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors); - - void freeVectors(); - - // Restriction operators - void setupInputs(Vector *in); - - void setupOutputs(Vector *out); - - void applyQFunction(); - - // Push arguments for a given field - void pushKernelArgs(Vector *vec, const bool isInput, const int index); - - void pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis); - - void pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis); - - // Set props for a given field - ::occa::properties getKernelProps(); - - void applyAdd(Vector *in, Vector *out); - - ::occa::kernel buildApplyAddKernel(); - - //---[ Kernel Generation ]------------------ - void addBasisFunctionSource(std::stringstream &ss); - - void addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis); - - void addKernelSource(std::stringstream &ss); - - void addKernelArgsSource(std::stringstream &ss); - - void addKernelArgSource(std::stringstream &ss, const bool isInput, const int index); - - void addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField, - const QFunctionField &qfField, std::stringstream &dimAttribute); - - void addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, 
const int index, const OperatorField &opField, - const QFunctionField &qfField, std::stringstream &dimAttribute); - - void addQuadArraySource(std::stringstream &ss); - - void addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name); - - void addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name); - - void addInputSetupSource(std::stringstream &ss); - - void addOutputSetupSource(std::stringstream &ss); - - void addBasisApplySource(std::stringstream &ss, const bool isInput, const int count); - - void addInterpSource(std::stringstream &ss, const bool isInput, const int index); - - void addGradTensorSource(std::stringstream &ss, const bool isInput, const int index); - - void addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index); - - void addWeightSource(std::stringstream &ss, const bool isInput, const int index); - - void addCopySource(std::stringstream &ss, const bool isInput, const int index); - - void addQFunctionApplicationSource(std::stringstream &ss); - - // ---[ Variables ]--------------- - inline std::string xputName(const bool isInput) { return isInput ? "input" : "output"; } - - inline std::string indexedVar(const std::string &name, const int index) { return name + std::to_string(index); } - - inline std::string indexedVar(const std::string &name, const bool isInput, const int index) { - return (isInput ? 
"input" : "output") + std::to_string(index) + "_" + name; - } - - inline std::string dofInputVar(const int index) { return indexedVar("dofInput", index); } - - inline std::string dofOutputVar(const int index) { return indexedVar("dofOutput", index); } - - inline std::string interpVar(const bool isInput, const int index) { return indexedVar("B", isInput, index); } - - inline std::string gradVar(const bool isInput, const int index) { return indexedVar("Bx", isInput, index); } - - inline std::string qWeightVar(const bool isInput, const int index) { return indexedVar("qWeights", isInput, index); } - - std::string elementFunction(const bool isInput, const int index); - - std::string fieldFunctionName(const QFunctionField &qfField); - - std::string fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp deleted file mode 100644 index 7bfae3d87f..0000000000 --- a/backends/occa/ceed-occa-elem-restriction.cpp +++ /dev/null @@ -1,372 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "./ceed-occa-elem-restriction.hpp" - -#include -#include - -#include "./ceed-occa-kernels.hpp" -#include "./ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -ElemRestriction::ElemRestriction() - : ceedElementCount(0), - ceedElementSize(0), - ceedComponentCount(0), - ceedLVectorSize(0), - ceedNodeStride(0), - ceedComponentStride(0), - ceedElementStride(0), - ceedUnstridedComponentStride(0), - freeHostIndices(true), - hostIndices(NULL), - freeIndices(true) {} - -ElemRestriction::~ElemRestriction() { - if (freeHostIndices) { - CeedFree(&hostIndices); - } - if (freeIndices) { - indices.free(); - } -} - -void ElemRestriction::setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput) { - if (memType == CEED_MEM_HOST) { - setupFromHostMemory(copyMode, indicesInput); - } else { - setupFromDeviceMemory(copyMode, indicesInput); - } - - setupTransposeIndices(); -} - -void ElemRestriction::setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h) { - const CeedInt entries = ceedElementCount * ceedElementSize; - - freeHostIndices = (copyMode == CEED_OWN_POINTER || copyMode == CEED_COPY_VALUES); - - if (copyMode != CEED_COPY_VALUES) { - hostIndices = const_cast(indices_h); - } else { - const size_t bytes = entries * sizeof(CeedInt); - hostIndices = (CeedInt *)::malloc(bytes); - std::memcpy(hostIndices, indices_h, bytes); - } - - if (hostIndices) { - indices = getDevice().malloc(entries, hostIndices); - } -} - -void ElemRestriction::setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d) { - ::occa::memory deviceIndices = arrayToMemory(indices_d); - - freeIndices = (copyMode == CEED_OWN_POINTER); - - if (copyMode == CEED_COPY_VALUES) { - indices = deviceIndices.clone(); - } else { - indices = deviceIndices; - } -} - -bool ElemRestriction::usesIndices() { return indices.isInitialized(); } - -void 
ElemRestriction::setupTransposeIndices() { - if (!usesIndices() || transposeQuadIndices.isInitialized()) { - return; - } - - const CeedInt elementEntryCount = ceedElementCount * ceedElementSize; - - bool *indexIsUsed = new bool[ceedLVectorSize]; - std::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool)); - - for (CeedInt i = 0; i < elementEntryCount; ++i) { - indexIsUsed[hostIndices[i]] = true; - } - - CeedInt nodeCount = 0; - for (CeedInt i = 0; i < ceedLVectorSize; ++i) { - nodeCount += indexIsUsed[i]; - } - - const CeedInt dofOffsetCount = nodeCount + 1; - CeedInt *quadIndexToDofOffset = new CeedInt[ceedLVectorSize]; - CeedInt *transposeQuadIndices_h = new CeedInt[nodeCount]; - CeedInt *transposeDofOffsets_h = new CeedInt[dofOffsetCount]; - CeedInt *transposeDofIndices_h = new CeedInt[elementEntryCount]; - - std::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt)); - - // Compute ids - CeedInt offsetId = 0; - for (CeedInt i = 0; i < ceedLVectorSize; ++i) { - if (indexIsUsed[i]) { - transposeQuadIndices_h[offsetId] = i; - quadIndexToDofOffset[i] = offsetId++; - } - } - - // Count how many times a specific quad node is used - for (CeedInt i = 0; i < elementEntryCount; ++i) { - ++transposeDofOffsets_h[quadIndexToDofOffset[hostIndices[i]] + 1]; - } - - // Aggregate to find true offsets - for (CeedInt i = 1; i < dofOffsetCount; ++i) { - transposeDofOffsets_h[i] += transposeDofOffsets_h[i - 1]; - } - - // Compute dof indices - for (CeedInt i = 0; i < elementEntryCount; ++i) { - const CeedInt quadIndex = hostIndices[i]; - const CeedInt dofIndex = transposeDofOffsets_h[quadIndexToDofOffset[quadIndex]]++; - transposeDofIndices_h[dofIndex] = i; - } - - // Reset offsets - for (int i = dofOffsetCount - 1; i > 0; --i) { - transposeDofOffsets_h[i] = transposeDofOffsets_h[i - 1]; - } - transposeDofOffsets_h[0] = 0; - - // Copy to device - ::occa::device device = getDevice(); - - transposeQuadIndices = device.malloc(nodeCount, transposeQuadIndices_h); - 
transposeDofOffsets = device.malloc(dofOffsetCount, transposeDofOffsets_h); - transposeDofIndices = device.malloc(elementEntryCount, transposeDofIndices_h); - - // Clean up temporary arrays - delete[] indexIsUsed; - delete[] quadIndexToDofOffset; - delete[] transposeQuadIndices_h; - delete[] transposeDofOffsets_h; - delete[] transposeDofIndices_h; -} - -void ElemRestriction::setKernelProperties() { - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/COMPONENT_COUNT"] = ceedComponentCount; - kernelProperties["defines/ELEMENT_SIZE"] = ceedElementSize; - kernelProperties["defines/TILE_SIZE"] = 64; - kernelProperties["defines/USES_INDICES"] = usesIndices(); - kernelProperties["defines/USER_STRIDES"] = StrideType::USER_STRIDES; - kernelProperties["defines/NOT_STRIDED"] = StrideType::NOT_STRIDED; - kernelProperties["defines/BACKEND_STRIDES"] = StrideType::BACKEND_STRIDES; - kernelProperties["defines/STRIDE_TYPE"] = ceedStrideType; - kernelProperties["defines/NODE_COUNT"] = transposeQuadIndices.length(); - kernelProperties["defines/NODE_STRIDE"] = ceedNodeStride; - kernelProperties["defines/COMPONENT_STRIDE"] = ceedComponentStride; - kernelProperties["defines/ELEMENT_STRIDE"] = ceedElementStride; - kernelProperties["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride; -} - -ElemRestriction *ElemRestriction::getElemRestriction(CeedElemRestriction r, const bool assertValid) { - if (!r || r == CEED_ELEMRESTRICTION_NONE) { - return NULL; - } - - int ierr; - ElemRestriction *elemRestriction = NULL; - - ierr = CeedElemRestrictionGetData(r, (void **)&elemRestriction); - if (assertValid) { - CeedOccaFromChk(ierr); - } - - return elemRestriction; -} - -ElemRestriction *ElemRestriction::from(CeedElemRestriction r) { - ElemRestriction *elemRestriction = getElemRestriction(r); - if (!elemRestriction) { - return NULL; - } - - 
CeedCallOcca(CeedElemRestrictionGetCeed(r, &elemRestriction->ceed)); - - return elemRestriction->setupFrom(r); -} - -ElemRestriction *ElemRestriction::from(CeedOperatorField operatorField) { - CeedElemRestriction ceedElemRestriction; - - CeedCallOcca(CeedOperatorFieldGetElemRestriction(operatorField, &ceedElemRestriction)); - - return from(ceedElemRestriction); -} - -ElemRestriction *ElemRestriction::setupFrom(CeedElemRestriction r) { - CeedCallOcca(CeedElemRestrictionGetNumElements(r, &ceedElementCount)); - - CeedCallOcca(CeedElemRestrictionGetElementSize(r, &ceedElementSize)); - - CeedCallOcca(CeedElemRestrictionGetNumComponents(r, &ceedComponentCount)); - - CeedCallOcca(CeedElemRestrictionGetLVectorSize(r, &ceedLVectorSize)); - - // Find what type of striding the restriction uses - bool isStrided = false; - bool hasBackendStrides = false; - - CeedCallOcca(CeedElemRestrictionIsStrided(r, &isStrided)); - - if (isStrided) { - CeedCallOcca(CeedElemRestrictionHasBackendStrides(r, &hasBackendStrides)); - } - - if (isStrided) { - if (hasBackendStrides) { - ceedStrideType = BACKEND_STRIDES; - } else { - ceedStrideType = USER_STRIDES; - } - } else { - ceedStrideType = NOT_STRIDED; - } - - // Default strides - ceedNodeStride = 1; - ceedComponentStride = ceedElementSize; - ceedElementStride = ceedElementSize * ceedComponentCount; - ceedUnstridedComponentStride = 1; - - if (ceedStrideType == USER_STRIDES) { - CeedInt strides[3]; - - CeedCallOcca(CeedElemRestrictionGetStrides(r, strides)); - - ceedNodeStride = strides[0]; - ceedComponentStride = strides[1]; - ceedElementStride = strides[2]; - - } else if (ceedStrideType == NOT_STRIDED) { - CeedCallOcca(CeedElemRestrictionGetCompStride(r, &ceedUnstridedComponentStride)); - } - - return this; -} - -int ElemRestriction::apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v) { - const bool rIsTransposed = (rTransposeMode != CEED_NOTRANSPOSE); - - // Todo: refactor - if (rIsTransposed) { - if 
(!restrictionTransposeKernel.isInitialized()) { - setKernelProperties(); - restrictionTransposeKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestrictionTranspose", kernelProperties); - } - restrictionTransposeKernel(ceedElementCount, transposeQuadIndices, transposeDofOffsets, transposeDofIndices, u.getConstKernelArg(), - v.getKernelArg()); - } else { - if (!restrictionKernel.isInitialized()) { - setKernelProperties(); - restrictionKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestriction", kernelProperties); - } - restrictionKernel(ceedElementCount, indices, u.getConstKernelArg(), v.getKernelArg()); - } - return CEED_ERROR_SUCCESS; -} - -int ElemRestriction::getOffsets(CeedMemType memType, const CeedInt **offsets) { - switch (memType) { - case CEED_MEM_HOST: { - *offsets = hostIndices; - return CEED_ERROR_SUCCESS; - } - case CEED_MEM_DEVICE: { - *offsets = memoryToArray(indices); - return CEED_ERROR_SUCCESS; - } - } - return ceedError("Unsupported CeedMemType passed to ElemRestriction::getOffsets"); -} - -//---[ Ceed Callbacks ]----------- -int ElemRestriction::registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "ElemRestriction", r, fname, f); -} - -int ElemRestriction::ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, const bool *orientsInput, - const CeedInt8 *curlOrientsInput, CeedElemRestriction r) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); - - if ((memType != CEED_MEM_DEVICE) && (memType != CEED_MEM_HOST)) { - return staticCeedError("Only HOST and DEVICE CeedMemType supported"); - } - - CeedRestrictionType rstr_type; - CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); - if ((rstr_type == CEED_RESTRICTION_ORIENTED) || (rstr_type == CEED_RESTRICTION_CURL_ORIENTED)) { - return staticCeedError("(OCCA) Backend does not implement 
CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); - } - - ElemRestriction *elemRestriction = new ElemRestriction(); - CeedCallBackend(CeedElemRestrictionSetData(r, elemRestriction)); - - // Setup Ceed objects before setting up memory - elemRestriction = ElemRestriction::from(r); - elemRestriction->setup(memType, copyMode, indicesInput); - - CeedInt defaultLayout[3] = {1, elemRestriction->ceedElementSize, elemRestriction->ceedElementSize * elemRestriction->ceedComponentCount}; - CeedCallBackend(CeedElemRestrictionSetELayout(r, defaultLayout)); - - CeedOccaRegisterFunction(r, "Apply", ElemRestriction::ceedApply); - CeedOccaRegisterFunction(r, "ApplyUnsigned", ElemRestriction::ceedApply); - CeedOccaRegisterFunction(r, "ApplyUnoriented", ElemRestriction::ceedApply); - CeedOccaRegisterFunction(r, "ApplyBlock", ElemRestriction::ceedApplyBlock); - CeedOccaRegisterFunction(r, "GetOffsets", ElemRestriction::ceedGetOffsets); - CeedOccaRegisterFunction(r, "Destroy", ElemRestriction::ceedDestroy); - - return CEED_ERROR_SUCCESS; -} - -int ElemRestriction::ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) { - ElemRestriction *elemRestriction = ElemRestriction::from(r); - Vector *uVector = Vector::from(u); - Vector *vVector = Vector::from(v); - - if (!elemRestriction) { - return staticCeedError("Incorrect CeedElemRestriction argument: r"); - } - if (!uVector) { - return elemRestriction->ceedError("Incorrect CeedVector argument: u"); - } - if (!vVector) { - return elemRestriction->ceedError("Incorrect CeedVector argument: v"); - } - - return elemRestriction->apply(tmode, *uVector, *vVector); -} - -int ElemRestriction::ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) { - return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionApplyBlock"); -} - -int 
ElemRestriction::ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets) { - ElemRestriction *elemRestriction = ElemRestriction::from(r); - - if (!elemRestriction) { - return staticCeedError("Incorrect CeedElemRestriction argument: r"); - } - - return elemRestriction->getOffsets(memType, offsets); -} - -int ElemRestriction::ceedDestroy(CeedElemRestriction r) { - delete getElemRestriction(r, false); - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-elem-restriction.hpp b/backends/occa/ceed-occa-elem-restriction.hpp deleted file mode 100644 index 7ac03146b8..0000000000 --- a/backends/occa/ceed-occa-elem-restriction.hpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_ELEMRESTRICTION_HEADER -#define CEED_OCCA_ELEMRESTRICTION_HEADER - -#include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -enum StrideType { - BACKEND_STRIDES = 0, - USER_STRIDES = 1, - NOT_STRIDED = 2, -}; - -class ElemRestriction : public CeedObject { - public: - // Ceed object information - CeedInt ceedElementCount; - CeedInt ceedElementSize; - CeedInt ceedComponentCount; - CeedSize ceedLVectorSize; - StrideType ceedStrideType; - CeedInt ceedNodeStride; - CeedInt ceedComponentStride; - CeedInt ceedElementStride; - CeedInt ceedUnstridedComponentStride; - - // Passed resources - bool freeHostIndices; - CeedInt *hostIndices; - - // Owned resources - bool freeIndices; - ::occa::memory indices; - - ::occa::memory transposeQuadIndices; - ::occa::memory transposeDofOffsets; - ::occa::memory transposeDofIndices; - - ::occa::json kernelProperties; - ::occa::kernel restrictionKernel; - 
::occa::kernel restrictionTransposeKernel; - - ElemRestriction(); - - ~ElemRestriction(); - - void setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput); - - void setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h); - - void setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d); - - bool usesIndices(); - - void setupTransposeIndices(); - - void setKernelProperties(); - - static ElemRestriction *getElemRestriction(CeedElemRestriction r, const bool assertValid = true); - - static ElemRestriction *from(CeedElemRestriction r); - static ElemRestriction *from(CeedOperatorField operatorField); - ElemRestriction *setupFrom(CeedElemRestriction r); - - int apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v); - - int getOffsets(CeedMemType memType, const CeedInt **offsets); - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f); - - static int ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, const bool *orientsInput, - const CeedInt8 *curlOrientsInput, CeedElemRestriction r); - - static int ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request); - - static int ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets); - - static int ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request); - - static int ceedDestroy(CeedElemRestriction r); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-gpu-operator.cpp b/backends/occa/ceed-occa-gpu-operator.cpp deleted file mode 100644 index af7a43becd..0000000000 --- a/backends/occa/ceed-occa-gpu-operator.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-gpu-operator.hpp" - -#include "ceed-occa-qfunction.hpp" - -namespace ceed { -namespace occa { -GpuOperator::GpuOperator() {} - -GpuOperator::~GpuOperator() {} - -::occa::kernel GpuOperator::buildApplyAddKernel() { return ::occa::kernel(); } - -void GpuOperator::applyAdd(Vector *in, Vector *out) { - // TODO: Implement -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-gpu-operator.hpp b/backends/occa/ceed-occa-gpu-operator.hpp deleted file mode 100644 index fc14304975..0000000000 --- a/backends/occa/ceed-occa-gpu-operator.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_GPU_OPERATOR_HEADER -#define CEED_OCCA_GPU_OPERATOR_HEADER - -#include - -#include "ceed-occa-operator.hpp" - -namespace ceed { -namespace occa { -class GpuOperator : public Operator { - public: - GpuOperator(); - - ~GpuOperator(); - - ::occa::kernel buildApplyAddKernel(); - - void applyAdd(Vector *in, Vector *out); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-kernels.hpp b/backends/occa/ceed-occa-kernels.hpp deleted file mode 100644 index 86469be1f1..0000000000 --- a/backends/occa/ceed-occa-kernels.hpp +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_KERNELS_HEADER -#define CEED_OCCA_KERNELS_HEADER - -#include "./kernels/elem-restriction.hpp" -#include "./kernels/set-value.hpp" -#include "./kernels/simplex-basis.hpp" -#include "./kernels/tensor-basis.hpp" - -#endif diff --git a/backends/occa/ceed-occa-operator-args.cpp b/backends/occa/ceed-occa-operator-args.cpp deleted file mode 100644 index 61199ce288..0000000000 --- a/backends/occa/ceed-occa-operator-args.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-operator-args.hpp" - -namespace ceed { -namespace occa { -OperatorArgs::OperatorArgs() : QFunctionArgs() {} - -OperatorArgs::OperatorArgs(CeedOperator op) : QFunctionArgs() { setupArgs(op); } - -void OperatorArgs::setupArgs(CeedOperator op) { - CeedQFunction qf; - CeedOperatorField *ceedInputFields, *ceedOutputFields; - - CeedCallOccaValid(_isValid, CeedOperatorGetQFunction(op, &qf)); - setupQFunctionArgs(qf); - - if (!_isValid) { - return; - } - - CeedCallOccaValid(_isValid, CeedOperatorGetFields(op, NULL, &ceedInputFields, NULL, &ceedOutputFields)); - - for (int i = 0; i < _inputCount; ++i) { - OperatorField field = OperatorField(ceedInputFields[i]); - opInputs.push_back(field); - _isValid &= field.isValid(); - } - - for (int i = 0; i < _outputCount; ++i) { - OperatorField field = OperatorField(ceedOutputFields[i]); - opOutputs.push_back(field); - _isValid &= field.isValid(); - } -} - -const OperatorField &OperatorArgs::getOpField(const bool isInput, const int index) const { return isInput ? 
opInputs[index] : opOutputs[index]; } - -const OperatorField &OperatorArgs::getOpInput(const int index) const { return opInputs[index]; } - -const OperatorField &OperatorArgs::getOpOutput(const int index) const { return opOutputs[index]; } -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-operator-args.hpp b/backends/occa/ceed-occa-operator-args.hpp deleted file mode 100644 index 5edf95188c..0000000000 --- a/backends/occa/ceed-occa-operator-args.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_OPERATORARGS_HEADER -#define CEED_OCCA_OPERATORARGS_HEADER - -#include - -#include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-operator-field.hpp" -#include "ceed-occa-qfunction-args.hpp" - -namespace ceed { -namespace occa { -typedef std::vector OperatorFieldVector; - -class OperatorArgs : public QFunctionArgs { - public: - OperatorFieldVector opInputs; - OperatorFieldVector opOutputs; - - OperatorArgs(); - OperatorArgs(CeedOperator op); - - void setupArgs(CeedOperator op); - - const OperatorField &getOpField(const bool isInput, const int index) const; - - const OperatorField &getOpInput(const int index) const; - - const OperatorField &getOpOutput(const int index) const; -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp deleted file mode 100644 index 6716d11e06..0000000000 --- a/backends/occa/ceed-occa-operator-field.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-operator-field.hpp" - -#include "ceed-occa-basis.hpp" -#include "ceed-occa-elem-restriction.hpp" -#include "ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -OperatorField::OperatorField(CeedOperatorField opField) : _isValid(false), _usesActiveVector(false), vec(NULL), basis(NULL), elemRestriction(NULL) { - CeedBasis ceedBasis; - CeedVector ceedVector; - CeedElemRestriction ceedElemRestriction; - - CeedCallOccaValid(_isValid, CeedOperatorFieldGetBasis(opField, &ceedBasis)); - - CeedCallOccaValid(_isValid, CeedOperatorFieldGetVector(opField, &ceedVector)); - - CeedCallOccaValid(_isValid, CeedOperatorFieldGetElemRestriction(opField, &ceedElemRestriction)); - - _isValid = true; - _usesActiveVector = ceedVector == CEED_VECTOR_ACTIVE; - - vec = Vector::from(ceedVector); - basis = Basis::from(ceedBasis); - elemRestriction = ElemRestriction::from(ceedElemRestriction); -} - -bool OperatorField::isValid() const { return _isValid; } - -//---[ Vector Info ]---------------- -bool OperatorField::usesActiveVector() const { return _usesActiveVector; } -//================================== - -//---[ Basis Info ]----------------- -bool OperatorField::hasBasis() const { return basis; } - -int OperatorField::usingTensorBasis() const { return basis->isTensorBasis(); } - -int OperatorField::getComponentCount() const { return (basis ? basis->ceedComponentCount : 1); } - -int OperatorField::getP() const { return (basis ? basis->P : 0); } - -int OperatorField::getQ() const { return (basis ? basis->Q : 0); } - -int OperatorField::getDim() const { return (basis ? basis->dim : 1); } -//================================== - -//---[ ElemRestriction Info ]------- -int OperatorField::getElementCount() const { return (elemRestriction ? elemRestriction->ceedElementCount : 1); } - -int OperatorField::getElementSize() const { return (elemRestriction ? 
elemRestriction->ceedElementSize : 1); } -//================================== -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-operator-field.hpp b/backends/occa/ceed-occa-operator-field.hpp deleted file mode 100644 index 4eeb5e70ed..0000000000 --- a/backends/occa/ceed-occa-operator-field.hpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_OPERATORFIELD_HEADER -#define CEED_OCCA_OPERATORFIELD_HEADER - -#include "ceed-occa-context.hpp" - -namespace ceed { -namespace occa { -class Basis; -class ElemRestriction; -class Vector; - -class OperatorField { - private: - bool _isValid; - bool _usesActiveVector; - - public: - Vector *vec; - Basis *basis; - ElemRestriction *elemRestriction; - - OperatorField(CeedOperatorField opField); - - bool isValid() const; - - //---[ Vector Info ]-------------- - bool usesActiveVector() const; - //================================ - - //---[ Basis Info ]--------------- - bool hasBasis() const; - int usingTensorBasis() const; - - int getComponentCount() const; - int getP() const; - int getQ() const; - int getDim() const; - //================================ - - //---[ ElemRestriction Info ]----- - int getElementCount() const; - int getElementSize() const; - //================================ -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-operator.cpp b/backends/occa/ceed-occa-operator.cpp deleted file mode 100644 index c19e875033..0000000000 --- a/backends/occa/ceed-occa-operator.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-operator.hpp" - -#include "ceed-occa-basis.hpp" -#include "ceed-occa-cpu-operator.hpp" -#include "ceed-occa-elem-restriction.hpp" -#include "ceed-occa-gpu-operator.hpp" -#include "ceed-occa-qfunction.hpp" - -namespace ceed { -namespace occa { -Operator::Operator() : ceedQ(0), ceedElementCount(0), qfunction(NULL), needsInitialSetup(true) {} - -Operator::~Operator() {} - -Operator *Operator::getOperator(CeedOperator op, const bool assertValid) { - if (!op) { - return NULL; - } - - int ierr; - Operator *operator_ = NULL; - - ierr = CeedOperatorGetData(op, (void **)&operator_); - if (assertValid) { - CeedOccaFromChk(ierr); - } - - return operator_; -} - -Operator *Operator::from(CeedOperator op) { - Operator *operator_ = getOperator(op); - if (!operator_) { - return NULL; - } - - CeedCallOcca(CeedOperatorGetCeed(op, &operator_->ceed)); - - operator_->qfunction = QFunction::from(op); - if (!operator_->qfunction) { - return NULL; - } - - CeedCallOcca(CeedOperatorGetNumQuadraturePoints(op, &operator_->ceedQ)); - CeedCallOcca(CeedOperatorGetNumElements(op, &operator_->ceedElementCount)); - - operator_->args.setupArgs(op); - if (!operator_->args.isValid()) { - return NULL; - } - - return operator_; -} - -bool Operator::isApplyingIdentityFunction() { return qfunction->ceedIsIdentity; } - -int Operator::applyAdd(Vector *in, Vector *out, CeedRequest *request) { - // TODO: Cache kernel objects rather than relying on OCCA kernel caching - applyAddKernel = buildApplyAddKernel(); - - if (needsInitialSetup) { - initialSetup(); - needsInitialSetup = false; - } - - applyAdd(in, out); - - return CEED_ERROR_SUCCESS; -} - -//---[ Virtual Methods ]------------ -void Operator::initialSetup() {} - -//---[ Ceed Callbacks ]------------- -int Operator::registerCeedFunction(Ceed ceed, CeedOperator op, const char 
*fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Operator", op, fname, f); -} - -int Operator::ceedCreate(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - -#if 1 - Operator *operator_ = new CpuOperator(); -#else - // TODO: Add GPU specific operator - Operator *operator_ = (Context::from(ceed)->usingCpuDevice() ? ((Operator *)new CpuOperator()) : ((Operator *)new GpuOperator())); -#endif - - CeedCallBackend(CeedOperatorSetData(op, operator_)); - - CeedOccaRegisterFunction(op, "LinearAssembleQFunction", Operator::ceedLinearAssembleQFunction); - CeedOccaRegisterFunction(op, "LinearAssembleQFunctionUpdate", Operator::ceedLinearAssembleQFunction); - CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal); - CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal); - CeedOccaRegisterFunction(op, "CreateFDMElementInverse", Operator::ceedCreateFDMElementInverse); - CeedOccaRegisterFunction(op, "ApplyAdd", Operator::ceedApplyAdd); - CeedOccaRegisterFunction(op, "Destroy", Operator::ceedDestroy); - - return CEED_ERROR_SUCCESS; -} - -int Operator::ceedCreateComposite(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - - CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal); - CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal); - - return CEED_ERROR_SUCCESS; -} - -int Operator::ceedLinearAssembleQFunction(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunction"); } - -int Operator::ceedLinearAssembleQFunctionUpdate(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunctionUpdate"); -} - -int Operator::ceedLinearAssembleAddDiagonal(CeedOperator op) { return staticCeedError("(OCCA) 
Backend does not implement LinearAssembleDiagonal"); } - -int Operator::ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement LinearAssemblePointBlockDiagonal"); -} - -int Operator::ceedCreateFDMElementInverse(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement CreateFDMElementInverse"); } - -int Operator::ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { - Operator *operator_ = Operator::from(op); - Vector *in = Vector::from(invec); - Vector *out = Vector::from(outvec); - - if (!operator_) { - return staticCeedError("Incorrect CeedOperator argument: op"); - } - - return operator_->applyAdd(in, out, request); -} - -int Operator::ceedDestroy(CeedOperator op) { - delete getOperator(op, false); - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-operator.hpp b/backends/occa/ceed-occa-operator.hpp deleted file mode 100644 index 5325bdf33d..0000000000 --- a/backends/occa/ceed-occa-operator.hpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_OPERATOR_HEADER -#define CEED_OCCA_OPERATOR_HEADER - -#include - -#include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-operator-args.hpp" - -namespace ceed { -namespace occa { -typedef std::vector VectorVector_t; - -class QFunction; - -class Operator : public CeedObject { - public: - // Ceed object information - CeedInt ceedQ; - CeedInt ceedElementCount; - - // Owned resources - QFunction *qfunction; - OperatorArgs args; - ::occa::kernel applyAddKernel; - bool needsInitialSetup; - - // Reference to other memory - ::occa::memory qFunctionContextData; - - Operator(); - virtual ~Operator(); - - static Operator *getOperator(CeedOperator op, const bool assertValid = true); - - static Operator *from(CeedOperator op); - - bool isApplyingIdentityFunction(); - - int applyAdd(Vector *in, Vector *out, CeedRequest *request); - - //---[ Virtual Methods ]---------- - virtual ::occa::kernel buildApplyAddKernel() = 0; - - virtual void initialSetup(); - - virtual void applyAdd(Vector *in, Vector *out) = 0; - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f); - - static int ceedCreate(CeedOperator op); - static int ceedCreateComposite(CeedOperator op); - - static int ceedLinearAssembleQFunction(CeedOperator op); - static int ceedLinearAssembleQFunctionUpdate(CeedOperator op); - static int ceedLinearAssembleAddDiagonal(CeedOperator op); - static int ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op); - static int ceedCreateFDMElementInverse(CeedOperator op); - - static int ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request); - - static int ceedDestroy(CeedOperator op); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-qfunction-args.cpp 
b/backends/occa/ceed-occa-qfunction-args.cpp deleted file mode 100644 index b8d2d9e936..0000000000 --- a/backends/occa/ceed-occa-qfunction-args.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-qfunction-args.hpp" - -namespace ceed { -namespace occa { -QFunctionArgs::QFunctionArgs() : _isValid(false), _inputCount(0), _outputCount(0) {} - -QFunctionArgs::QFunctionArgs(CeedQFunction qf) : _isValid(false), _inputCount(0), _outputCount(0) { setupQFunctionArgs(qf); } - -void QFunctionArgs::setupQFunctionArgs(CeedQFunction qf) { - CeedQFunctionField *ceedInputFields, *ceedOutputFields; - - CeedCallOccaValid(_isValid, CeedQFunctionGetCeed(qf, &ceed)); - - CeedCallOccaValid(_isValid, CeedQFunctionGetNumArgs(qf, &_inputCount, &_outputCount)); - - CeedCallOccaValid(_isValid, CeedQFunctionGetFields(qf, NULL, &ceedInputFields, NULL, &ceedOutputFields)); - - _isValid = true; - - for (int i = 0; i < _inputCount; ++i) { - QFunctionField field = QFunctionField(ceedInputFields[i]); - qfInputs.push_back(field); - _isValid &= field.isValid(); - } - - for (int i = 0; i < _outputCount; ++i) { - QFunctionField field = QFunctionField(ceedOutputFields[i]); - qfOutputs.push_back(field); - _isValid &= field.isValid(); - } -} - -bool QFunctionArgs::isValid() const { return _isValid; } - -int QFunctionArgs::inputCount() const { return _inputCount; } - -int QFunctionArgs::outputCount() const { return _outputCount; } - -const QFunctionField &QFunctionArgs::getQfField(const bool isInput, const int index) const { return isInput ? 
qfInputs[index] : qfOutputs[index]; } - -const QFunctionField &QFunctionArgs::getQfInput(const int index) const { return qfInputs[index]; } - -const QFunctionField &QFunctionArgs::getQfOutput(const int index) const { return qfOutputs[index]; } - -CeedEvalMode QFunctionArgs::getEvalMode(const bool isInput, const int index) const { - return isInput ? qfInputs[index].evalMode : qfOutputs[index].evalMode; -} - -CeedEvalMode QFunctionArgs::getInputEvalMode(const int index) const { return qfInputs[index].evalMode; } - -CeedEvalMode QFunctionArgs::getOutputEvalMode(const int index) const { return qfOutputs[index].evalMode; } -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunction-args.hpp b/backends/occa/ceed-occa-qfunction-args.hpp deleted file mode 100644 index 77093ec93d..0000000000 --- a/backends/occa/ceed-occa-qfunction-args.hpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_QFUNCTIONARGS_HEADER -#define CEED_OCCA_QFUNCTIONARGS_HEADER - -#include - -#include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-qfunction-field.hpp" - -namespace ceed { -namespace occa { -typedef std::vector QFunctionFieldVector; - -class QFunctionArgs : public CeedObject { - protected: - bool _isValid; - CeedInt _inputCount; - CeedInt _outputCount; - - public: - QFunctionFieldVector qfInputs; - QFunctionFieldVector qfOutputs; - - QFunctionArgs(); - QFunctionArgs(CeedQFunction qf); - - void setupQFunctionArgs(CeedQFunction qf); - - bool isValid() const; - - int inputCount() const; - int outputCount() const; - - const QFunctionField &getQfField(const bool isInput, const int index) const; - - const QFunctionField &getQfInput(const int index) const; - - const QFunctionField &getQfOutput(const int index) const; - - CeedEvalMode getEvalMode(const bool isInput, const int index) const; - - CeedEvalMode getInputEvalMode(const int index) const; - - CeedEvalMode getOutputEvalMode(const int index) const; -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-qfunction-field.cpp b/backends/occa/ceed-occa-qfunction-field.cpp deleted file mode 100644 index 7dada84ba8..0000000000 --- a/backends/occa/ceed-occa-qfunction-field.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-qfunction-field.hpp" - -namespace ceed { -namespace occa { -QFunctionField::QFunctionField(CeedQFunctionField qfField) : _isValid(false), size(0) { - CeedCallOccaValid(_isValid, CeedQFunctionFieldGetEvalMode(qfField, &evalMode)); - - CeedCallOccaValid(_isValid, CeedQFunctionFieldGetSize(qfField, &size)); - - _isValid = true; -} - -bool QFunctionField::isValid() const { return _isValid; } -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunction-field.hpp b/backends/occa/ceed-occa-qfunction-field.hpp deleted file mode 100644 index 86eefd690e..0000000000 --- a/backends/occa/ceed-occa-qfunction-field.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_QFUNCTIONFIELD_HEADER -#define CEED_OCCA_QFUNCTIONFIELD_HEADER - -#include "ceed-occa-context.hpp" - -namespace ceed { -namespace occa { -class QFunctionField { - protected: - bool _isValid; - - public: - CeedEvalMode evalMode; - CeedInt size; - - QFunctionField(CeedQFunctionField qfField); - - bool isValid() const; -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-qfunction.cpp b/backends/occa/ceed-occa-qfunction.cpp deleted file mode 100644 index ac8e3b7386..0000000000 --- a/backends/occa/ceed-occa-qfunction.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-qfunction.hpp" - -#include -#include - -#include "ceed-occa-qfunctioncontext.hpp" -#include "ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -QFunction::QFunction(const std::string &source, const std::string &function_name) : ceedIsIdentity(false) { - filename = source; - qFunctionName = function_name; -} - -QFunction *QFunction::getQFunction(CeedQFunction qf, const bool assertValid) { - if (!qf) { - return NULL; - } - - QFunction *qFunction = NULL; - - CeedCallOcca(CeedQFunctionGetData(qf, &qFunction)); - - return qFunction; -} - -QFunction *QFunction::from(CeedQFunction qf) { - QFunction *qFunction = getQFunction(qf); - if (!qFunction) { - return NULL; - } - - CeedCallOcca(CeedQFunctionGetCeed(qf, &qFunction->ceed)); - - CeedCallOcca(CeedQFunctionGetInnerContext(qf, &qFunction->qFunctionContext)); - - CeedCallOcca(CeedQFunctionIsIdentity(qf, &qFunction->ceedIsIdentity)); - - qFunction->args.setupQFunctionArgs(qf); - if (!qFunction->args.isValid()) { - return NULL; - } - - return qFunction; -} - -QFunction *QFunction::from(CeedOperator op) { - if (!op) { - return NULL; - } - - CeedQFunction qf; - - CeedCallOcca(CeedOperatorGetQFunction(op, &qf)); - - return QFunction::from(qf); -} - -::occa::properties QFunction::getKernelProps(const CeedInt Q) { - ::occa::properties props; - - // Types - props["defines/CeedInt"] = ::occa::dtype::get().name(); - props["defines/CeedScalar"] = ::occa::dtype::get().name(); - - // CEED defines - props["defines/CeedPragmaSIMD"] = ""; - props["defines/CEED_Q_VLA"] = "OCCA_Q"; - props["defines/CEED_ERROR_SUCCESS"] = 0; - - std::stringstream ss; - ss << "#define CEED_QFUNCTION(FUNC_NAME) \\" << std::endl - << " inline int FUNC_NAME" << std::endl - << "#define CEED_QFUNCTION_HELPER \\" << std::endl - << " inline" << std::endl - << std::endl - << "#include \"" << filename << "\"" << std::endl; - - 
props["headers"].asArray() += ss.str(); - - return props; -} - -int QFunction::buildKernel(const CeedInt Q) { - // TODO: Store a kernel per Q - if (!qFunctionKernel.isInitialized()) { - ::occa::properties props = getKernelProps(Q); - - // Properties only used in the QFunction kernel source - props["defines/OCCA_Q"] = Q; - - const std::string kernelName = "qf_" + qFunctionName; - - qFunctionKernel = (getDevice().buildKernelFromString(getKernelSource(kernelName, Q), kernelName, props)); - } - - return CEED_ERROR_SUCCESS; -} - -std::string QFunction::getKernelSource(const std::string &kernelName, const CeedInt Q) { - std::stringstream ss; - - ss << "@kernel" << std::endl << "void " << kernelName << "(" << std::endl; - - // qfunction arguments - for (int i = 0; i < args.inputCount(); ++i) { - ss << " const CeedScalar *in" << i << ',' << std::endl; - } - for (int i = 0; i < args.outputCount(); ++i) { - ss << " CeedScalar *out" << i << ',' << std::endl; - } - ss << " void *ctx" << std::endl; - ss << ") {" << std::endl; - - // Iterate over Q and call qfunction - ss << " @tile(128, @outer, @inner)" << std::endl - << " for (int q = 0; q < OCCA_Q; ++q) {" << std::endl - << " const CeedScalar* in[" << std::max(1, args.inputCount()) << "];" << std::endl - << " CeedScalar* out[" << std::max(1, args.outputCount()) << "];" << std::endl; - - // Set and define in for the q point - for (int i = 0; i < args.inputCount(); ++i) { - const CeedInt fieldSize = args.getQfInput(i).size; - const std::string qIn_i = "qIn" + std::to_string(i); - const std::string in_i = "in" + std::to_string(i); - - ss << " CeedScalar " << qIn_i << "[" << fieldSize << "];" << std::endl - << " in[" << i << "] = " << qIn_i << ";" - << std::endl - // Copy q data - << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl - << " " << qIn_i << "[qi] = " << in_i << "[q + (OCCA_Q * qi)];" << std::endl - << " }" << std::endl; - } - - // Set out for the q point - for (int i = 0; i < args.outputCount(); ++i) 
{ - const CeedInt fieldSize = args.getQfOutput(i).size; - const std::string qOut_i = "qOut" + std::to_string(i); - - ss << " CeedScalar " << qOut_i << "[" << fieldSize << "];" << std::endl << " out[" << i << "] = " << qOut_i << ";" << std::endl; - } - - ss << " " << qFunctionName << "(ctx, 1, in, out);" << std::endl; - - // Copy out for the q point - for (int i = 0; i < args.outputCount(); ++i) { - const CeedInt fieldSize = args.getQfOutput(i).size; - const std::string qOut_i = "qOut" + std::to_string(i); - const std::string out_i = "out" + std::to_string(i); - - ss << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl - << " " << out_i << "[q + (OCCA_Q * qi)] = " << qOut_i << "[qi];" << std::endl - << " }" << std::endl; - } - - ss << " }" << std::endl << "}"; - - return ss.str(); -} - -int QFunction::apply(CeedInt Q, CeedVector *U, CeedVector *V) { - CeedCallBackend(buildKernel(Q)); - - std::vector outputArgs; - - qFunctionKernel.clearArgs(); - - for (CeedInt i = 0; i < args.inputCount(); i++) { - Vector *u = Vector::from(U[i]); - if (!u) { - return ceedError("Incorrect qFunction input field: U[" + std::to_string(i) + "]"); - } - qFunctionKernel.pushArg(u->getConstKernelArg()); - } - - for (CeedInt i = 0; i < args.outputCount(); i++) { - Vector *v = Vector::from(V[i]); - if (!v) { - return ceedError("Incorrect qFunction output field: V[" + std::to_string(i) + "]"); - } - qFunctionKernel.pushArg(v->getKernelArg()); - } - if (qFunctionContext) { - QFunctionContext *ctx = QFunctionContext::from(qFunctionContext); - qFunctionKernel.pushArg(ctx->getKernelArg()); - } else { - qFunctionKernel.pushArg(::occa::null); - } - - qFunctionKernel.run(); - - return CEED_ERROR_SUCCESS; -} - -//---[ Ceed Callbacks ]----------- -int QFunction::registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "QFunction", qf, fname, f); -} - -int QFunction::ceedCreate(CeedQFunction qf) { - Ceed 
ceed; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - Context *context; - CeedCallBackend(CeedGetData(ceed, &context)); - const char *source; - CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source)); - const char *function_name; - CeedCallBackend(CeedQFunctionGetKernelName(qf, &function_name)); - - QFunction *qFunction = new QFunction(source, function_name); - CeedCallBackend(CeedQFunctionSetData(qf, qFunction)); - - CeedOccaRegisterFunction(qf, "Apply", QFunction::ceedApply); - CeedOccaRegisterFunction(qf, "Destroy", QFunction::ceedDestroy); - - return CEED_ERROR_SUCCESS; -} - -int QFunction::ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { - QFunction *qFunction = QFunction::from(qf); - if (qFunction) { - return qFunction->apply(Q, U, V); - } - - return 1; -} - -int QFunction::ceedDestroy(CeedQFunction qf) { - delete getQFunction(qf, false); - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunction.hpp b/backends/occa/ceed-occa-qfunction.hpp deleted file mode 100644 index 4af04c5bd7..0000000000 --- a/backends/occa/ceed-occa-qfunction.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_QFUNCTION_HEADER -#define CEED_OCCA_QFUNCTION_HEADER - -#include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-qfunction-args.hpp" - -namespace ceed { -namespace occa { -class QFunction : public CeedObject { - public: - // Ceed object information - bool ceedIsIdentity; - - // Owned resources - std::string filename; - std::string qFunctionName; - ::occa::kernel qFunctionKernel; - CeedQFunctionContext qFunctionContext; - QFunctionArgs args; - - QFunction(const std::string &source, const std::string &function_name); - - static QFunction *getQFunction(CeedQFunction qf, const bool assertValid = true); - - static QFunction *from(CeedQFunction qf); - static QFunction *from(CeedOperator op); - - ::occa::properties getKernelProps(const CeedInt Q); - - int buildKernel(const CeedInt Q); - std::string getKernelSource(const std::string &kernelName, const CeedInt Q); - - int apply(CeedInt Q, CeedVector *U, CeedVector *V); - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f); - - static int ceedCreate(CeedQFunction qf); - - static int ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V); - - static int ceedDestroy(CeedQFunction qf); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-qfunctioncontext.cpp b/backends/occa/ceed-occa-qfunctioncontext.cpp deleted file mode 100644 index 017925f5a1..0000000000 --- a/backends/occa/ceed-occa-qfunctioncontext.cpp +++ /dev/null @@ -1,318 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-qfunctioncontext.hpp" - -#include - -namespace ceed { -namespace occa { -QFunctionContext::QFunctionContext() : ctxSize(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {} - -QFunctionContext::~QFunctionContext() { - memory.free(); - freeHostCtxBuffer(); -} - -QFunctionContext *QFunctionContext::getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid) { - if (!ctx) { - return NULL; - } - - int ierr; - QFunctionContext *ctx_ = NULL; - - ierr = CeedQFunctionContextGetBackendData(ctx, &ctx_); - if (assertValid) { - CeedOccaFromChk(ierr); - } - - return ctx_; -} - -QFunctionContext *QFunctionContext::from(CeedQFunctionContext ctx) { - QFunctionContext *ctx_ = getQFunctionContext(ctx); - if (!ctx_) { - return NULL; - } - - CeedCallOcca(CeedQFunctionContextGetContextSize(ctx, &ctx_->ctxSize)); - - if (ctx_ != NULL) { - CeedCallOcca(CeedQFunctionContextGetCeed(ctx, &ctx_->ceed)); - } - - return ctx_; -} - -void QFunctionContext::resizeCtx(const size_t ctxSize_) { ctxSize = ctxSize_; } - -void QFunctionContext::resizeCtxMemory(const size_t ctxSize_) { resizeCtxMemory(getDevice(), ctxSize_); } - -void QFunctionContext::resizeCtxMemory(::occa::device device, const size_t ctxSize_) { - if (ctxSize_ != memory.size()) { - memory.free(); - memory = device.malloc(ctxSize_); - } -} - -void QFunctionContext::resizeHostCtxBuffer(const size_t ctxSize_) { - CeedFree(&hostBuffer); - CeedMallocArray(1, ctxSize, &hostBuffer); -} - -void QFunctionContext::setCurrentCtxMemoryIfNeeded() { - if (!currentMemory.isInitialized()) { - resizeCtxMemory(ctxSize); - currentMemory = memory; - } -} - -void QFunctionContext::setCurrentHostCtxBufferIfNeeded() { - if (!currentHostBuffer) { - resizeHostCtxBuffer(ctxSize); - currentHostBuffer = hostBuffer; - } -} - -void QFunctionContext::freeHostCtxBuffer() { - if (hostBuffer) { - 
CeedFree(&hostBuffer); - } -} - -int QFunctionContext::hasValidData(bool *has_valid_data) const { - (*has_valid_data) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized()); - return CEED_ERROR_SUCCESS; -} - -int QFunctionContext::hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const { - switch (mem_type) { - case CEED_MEM_HOST: - (*has_borrowed_data_of_type) = !!currentHostBuffer; - break; - case CEED_MEM_DEVICE: - (*has_borrowed_data_of_type) = currentMemory.isInitialized(); - break; - } - return CEED_ERROR_SUCCESS; -} - -int QFunctionContext::setData(CeedMemType mtype, CeedCopyMode cmode, void *data) { - switch (cmode) { - case CEED_COPY_VALUES: - return copyDataValues(mtype, data); - case CEED_OWN_POINTER: - return ownDataPointer(mtype, data); - case CEED_USE_POINTER: - return useDataPointer(mtype, data); - } - return ceedError("Invalid CeedCopyMode passed"); -} - -int QFunctionContext::copyDataValues(CeedMemType mtype, void *data) { - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostCtxBufferIfNeeded(); - std::memcpy(currentHostBuffer, data, ctxSize); - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentCtxMemoryIfNeeded(); - currentMemory.copyFrom(dataToMemory(data)); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int QFunctionContext::ownDataPointer(CeedMemType mtype, void *data) { - switch (mtype) { - case CEED_MEM_HOST: - freeHostCtxBuffer(); - hostBuffer = currentHostBuffer = data; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - memory = currentMemory = dataToMemory(data); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int QFunctionContext::useDataPointer(CeedMemType mtype, void *data) { - switch (mtype) { - case CEED_MEM_HOST: 
- freeHostCtxBuffer(); - currentHostBuffer = data; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - currentMemory = dataToMemory(data); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int QFunctionContext::takeData(CeedMemType mtype, void *data) { - if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set"); - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostCtxBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentCtxMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - syncState = SyncState::host; - *(void **)data = currentHostBuffer; - hostBuffer = NULL; - currentHostBuffer = NULL; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentCtxMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostCtxBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - *(void **)data = memoryToData(currentMemory); - memory = ::occa::null; - currentMemory = ::occa::null; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int QFunctionContext::getData(CeedMemType mtype, void *data) { - // The passed `data` might be modified before restoring - if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set"); - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostCtxBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentCtxMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - syncState = SyncState::host; - *(void **)data = currentHostBuffer; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentCtxMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostCtxBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - *(void **)data = 
memoryToData(currentMemory); - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int QFunctionContext::restoreData() { return CEED_ERROR_SUCCESS; } - -::occa::memory QFunctionContext::getKernelArg() { - setCurrentCtxMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostCtxBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - return currentMemory; -} - -//---[ Ceed Callbacks ]----------- -int QFunctionContext::registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "QFunctionContext", ctx, fname, f); -} - -int QFunctionContext::ceedCreate(CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - - CeedOccaRegisterFunction(ctx, "HasValidData", QFunctionContext::ceedHasValidData); - CeedOccaRegisterFunction(ctx, "HasBorrowedDataOfType", QFunctionContext::ceedHasBorrowedDataOfType); - CeedOccaRegisterFunction(ctx, "SetData", QFunctionContext::ceedSetData); - CeedOccaRegisterFunction(ctx, "TakeData", QFunctionContext::ceedTakeData); - CeedOccaRegisterFunction(ctx, "GetData", QFunctionContext::ceedGetData); - CeedOccaRegisterFunction(ctx, "GetDataRead", QFunctionContext::ceedGetDataRead); - CeedOccaRegisterFunction(ctx, "RestoreData", QFunctionContext::ceedRestoreData); - CeedOccaRegisterFunction(ctx, "Destroy", QFunctionContext::ceedDestroy); - - QFunctionContext *ctx_ = new QFunctionContext(); - CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, ctx_)); - - return CEED_ERROR_SUCCESS; -} - -int QFunctionContext::ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->hasValidData(has_valid_data); -} - -int QFunctionContext::ceedHasBorrowedDataOfType(const 
CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->hasBorrowedDataOfType(mem_type, has_borrowed_data_of_type); -} - -int QFunctionContext::ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->setData(mtype, cmode, data); -} - -int QFunctionContext::ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->takeData(mtype, data); -} - -int QFunctionContext::ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->getData(mtype, data); -} - -int QFunctionContext::ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - // Todo: Determine if calling getData is sufficient - return ctx_->getData(mtype, data); -} - -int QFunctionContext::ceedRestoreData(CeedQFunctionContext ctx) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->restoreData(); -} - -int QFunctionContext::ceedDestroy(CeedQFunctionContext ctx) { - delete getQFunctionContext(ctx, false); - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunctioncontext.hpp 
b/backends/occa/ceed-occa-qfunctioncontext.hpp deleted file mode 100644 index 850eb3adbf..0000000000 --- a/backends/occa/ceed-occa-qfunctioncontext.hpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_QFUNCTIONCONTEXT_HEADER -#define CEED_OCCA_QFUNCTIONCONTEXT_HEADER - -#include "ceed-occa-ceed-object.hpp" - -namespace ceed { -namespace occa { -class QFunctionContext : public CeedObject { - public: - // Owned resources - size_t ctxSize; - ::occa::memory memory; - void *hostBuffer; - - // Current resources - ::occa::memory currentMemory; - void *currentHostBuffer; - - // State information - int syncState; - - QFunctionContext(); - - ~QFunctionContext(); - - static QFunctionContext *getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid = true); - - static QFunctionContext *from(CeedQFunctionContext ctx); - - ::occa::memory dataToMemory(const void *data) { - ::occa::memory mem((::occa::modeMemory_t *)data); - return mem; - } - - void *memoryToData(::occa::memory &memory) { return memory.getModeMemory(); } - - void resizeCtx(const size_t ctxSize_); - - void resizeCtxMemory(const size_t ctxSize_); - - void resizeCtxMemory(::occa::device device, const size_t ctxSize_); - - void resizeHostCtxBuffer(const size_t ctxSize_); - - void setCurrentCtxMemoryIfNeeded(); - - void setCurrentHostCtxBufferIfNeeded(); - - void freeHostCtxBuffer(); - - int hasValidData(bool *has_valid_data) const; - - int hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const; - - int setData(CeedMemType mtype, CeedCopyMode cmode, void *data); - - int copyDataValues(CeedMemType mtype, void *data); - - int ownDataPointer(CeedMemType mtype, void *data); - - int 
useDataPointer(CeedMemType mtype, void *data); - - int takeData(CeedMemType mtype, void *data); - - int getData(CeedMemType mtype, void *data); - - int restoreData(); - - ::occa::memory getKernelArg(); - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f); - - static int ceedCreate(CeedQFunctionContext ctx); - - static int ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data); - - static int ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type); - - static int ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data); - - static int ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data); - - static int ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data); - - static int ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data); - - static int ceedRestoreData(CeedQFunctionContext ctx); - - static int ceedDestroy(CeedQFunctionContext ctx); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-simplex-basis.cpp b/backends/occa/ceed-occa-simplex-basis.cpp deleted file mode 100644 index 747d21afd9..0000000000 --- a/backends/occa/ceed-occa-simplex-basis.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-simplex-basis.hpp" - -#include "ceed-occa-kernels.hpp" - -namespace ceed { -namespace occa { -SimplexBasis::SimplexBasis(CeedBasis basis, CeedInt dim_, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_, - const CeedScalar *qWeight_) { - setCeedFields(basis); - - dim = dim_; - P = P_; - Q = Q_; - - ::occa::device device = getDevice(); - - interp = device.malloc(P * Q, interp_); - grad = device.malloc(P * Q * dim, grad_); - qWeight = device.malloc(Q, qWeight_); - - setKernelProperties(); -} - -SimplexBasis::~SimplexBasis() {} - -bool SimplexBasis::isTensorBasis() const { return false; } - -const char *SimplexBasis::getFunctionSource() const { - // TODO: Add gpu function sources when split - return occa_simplex_basis_cpu_function_source; -} - -void SimplexBasis::setKernelProperties() { - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/DIM"] = dim; - kernelProperties["defines/Q"] = Q; - kernelProperties["defines/P"] = P; - kernelProperties["defines/MAX_PQ"] = P > Q ? P : Q; - kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount; - if (usingGpuDevice()) { - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = (Q <= 1024) ? 
(1024 / Q) : 1; - } -} - -::occa::kernel SimplexBasis::buildKernel(const std::string &kernelName) { - std::string kernelSource; - if (usingGpuDevice()) { - kernelSource = occa_simplex_basis_gpu_source; - } else { - kernelSource = occa_simplex_basis_cpu_function_source; - kernelSource += '\n'; - kernelSource += occa_simplex_basis_cpu_kernel_source; - } - - return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties); -} - -int SimplexBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { - if (transpose) { - if (!interpTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - interpTKernel = buildKernel("interp"); - } - - interpTKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg()); - } else { - if (!interpKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - interpKernel = buildKernel("interp"); - } - - interpKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; -} - -int SimplexBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { - if (transpose) { - if (!gradTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - gradTKernel = buildKernel("grad"); - } - - gradTKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg()); - } else { - if (!gradKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - gradKernel = buildKernel("grad"); - } - - gradKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; -} - -int SimplexBasis::applyWeight(const CeedInt elementCount, Vector &W) { - if (!weightKernel.isInitialized()) { - weightKernel = buildKernel("weight"); - } - weightKernel(elementCount, qWeight, W.getKernelArg()); - - return CEED_ERROR_SUCCESS; -} - -int SimplexBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector 
*U, Vector *V) { - const bool transpose = tmode == CEED_TRANSPOSE; - - if ((dim < 1) || (3 < dim)) { - return ceedError("Backend only supports dimensions: 1, 2, and 3"); - } - - // Check arguments - if (emode != CEED_EVAL_WEIGHT) { - if (!U) { - return ceedError("Incorrect CeedVector input: U"); - } - } - if (!V) { - return ceedError("Incorrect CeedVector input: V"); - } - - try { - // Apply kernel - switch (emode) { - case CEED_EVAL_INTERP: - return applyInterp(elementCount, transpose, *U, *V); - case CEED_EVAL_GRAD: - return applyGrad(elementCount, transpose, *U, *V); - case CEED_EVAL_WEIGHT: - return applyWeight(elementCount, *V); - default: - return ceedError("Backend does not support given simplex eval mode"); - } - } catch (::occa::exception &exc) { - // Handle kernel build errors the CEED way - CeedHandleOccaException(exc); - } - - return CEED_ERROR_SUCCESS; -} - -//---[ Ceed Callbacks ]------------- -int SimplexBasis::ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad, - const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - - SimplexBasis *basis_ = new SimplexBasis(basis, dim, ndof, nquad, interp, grad, qWeight); - CeedCallBackend(CeedBasisSetData(basis, basis_)); - - CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply); - CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy); - - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-simplex-basis.hpp b/backends/occa/ceed-occa-simplex-basis.hpp deleted file mode 100644 index c27b6d0a88..0000000000 --- a/backends/occa/ceed-occa-simplex-basis.hpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_SIMPLEXBASIS_HEADER -#define CEED_OCCA_SIMPLEXBASIS_HEADER - -#include "ceed-occa-basis.hpp" - -namespace ceed { -namespace occa { -class SimplexBasis : public Basis { - public: - ::occa::memory interp; - ::occa::memory grad; - ::occa::memory qWeight; - - ::occa::json kernelProperties; - ::occa::kernel interpKernel; - ::occa::kernel interpTKernel; - ::occa::kernel gradKernel; - ::occa::kernel gradTKernel; - ::occa::kernel weightKernel; - - SimplexBasis(CeedBasis basis, CeedInt dim, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_, const CeedScalar *qWeight_); - - ~SimplexBasis(); - - bool isTensorBasis() const; - - const char *getFunctionSource() const; - - void setKernelProperties(); - - std::string getKernelSource() const; - - ::occa::kernel buildKernel(const std::string &kernelName); - - int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); - - int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); - - int applyWeight(const CeedInt elementCount, Vector &W); - - int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v); - - //---[ Ceed Callbacks ]----------- - static int ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad, - const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-tensor-basis.cpp b/backends/occa/ceed-occa-tensor-basis.cpp deleted file mode 100644 index 553672170c..0000000000 --- a/backends/occa/ceed-occa-tensor-basis.cpp +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-tensor-basis.hpp" - -#include "ceed-occa-kernels.hpp" - -namespace ceed { -namespace occa { -TensorBasis::TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_, - const CeedScalar *qWeight1D_) - : P1D(P1D_), Q1D(Q1D_) { - setCeedFields(basis); - - dim = dim_; - - P = P1D; - Q = Q1D; - for (int i = 1; i < dim; ++i) { - P *= P1D; - Q *= Q1D; - } - - ::occa::device device = getDevice(); - - interp1D = device.malloc(P1D * Q1D, interp1D_); - grad1D = device.malloc(P1D * Q1D, grad1D_); - qWeight1D = device.malloc(Q1D, qWeight1D_); - - setKernelProperties(); -} - -TensorBasis::~TensorBasis() {} - -bool TensorBasis::isTensorBasis() const { return true; } - -void TensorBasis::setKernelProperties() { - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/Q1D"] = Q1D; - kernelProperties["defines/P1D"] = P1D; - kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount; - if (usingGpuDevice()) { - kernelProperties["defines/MAX_PQ"] = (Q1D > P1D) ? 
Q1D : P1D; - } -} - -const char *TensorBasis::getFunctionSource() const { - // TODO: Add gpu function sources when split - const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source, - occa_tensor_basis_3d_cpu_function_source}; - return cpuFunctionSources[dim - 1]; -} - -std::string TensorBasis::getKernelSource() const { - const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source, - occa_tensor_basis_3d_cpu_function_source}; - const char *cpuKernelSources[3] = {occa_tensor_basis_1d_cpu_kernel_source, occa_tensor_basis_2d_cpu_kernel_source, - occa_tensor_basis_3d_cpu_kernel_source}; - const char *gpuKernelSources[3] = {occa_tensor_basis_1d_gpu_source, occa_tensor_basis_2d_gpu_source, occa_tensor_basis_3d_gpu_source}; - - std::string kernelSource; - if (usingGpuDevice()) { - kernelSource = gpuKernelSources[dim - 1]; - } else { - kernelSource = cpuFunctionSources[dim - 1]; - kernelSource += '\n'; - kernelSource += cpuKernelSources[dim - 1]; - } - return kernelSource; -} - -::occa::kernel TensorBasis::buildKernel(const std::string &kernelName) { - std::string kernelSource = getKernelSource(); - return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties); -} - -int TensorBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { - if (transpose) { - if (!interpTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp(); - interpTKernel = buildKernel("interp"); - } - interpTKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg()); - } else { - if (!interpKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp(); - interpKernel = buildKernel("interp"); - } - interpKernel(elementCount, 
interp1D, U.getConstKernelArg(), V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; -} - -int TensorBasis::elementsPerBlockInterp() const { - int elementsPerBlock; - if (dim == 1) { - elementsPerBlock = 32; - } else if (dim == 2) { - const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8}; - if (Q1D < 7) { - elementsPerBlock = blocksByQ[Q1D]; - } else { - elementsPerBlock = 1; - } - } else { - elementsPerBlock = 1; - } - return elementsPerBlock; -} - -int TensorBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { - if (transpose) { - if (!gradTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad(); - gradTKernel = buildKernel("grad"); - } - gradTKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg()); - } else { - if (!gradKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad(); - gradKernel = buildKernel("grad"); - } - gradKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; -} - -int TensorBasis::elementsPerBlockGrad() const { - int elementsPerBlock; - if (dim == 1) { - elementsPerBlock = 32; - } else if (dim == 2) { - const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8}; - if (Q1D < 7) { - elementsPerBlock = blocksByQ[Q1D]; - } else { - elementsPerBlock = 1; - } - } else { - elementsPerBlock = 1; - } - return elementsPerBlock; -} - -int TensorBasis::applyWeight(const CeedInt elementCount, Vector &W) { - if (!weightKernel.isInitialized()) { - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockWeight(); - weightKernel = buildKernel("weight"); - } - weightKernel(elementCount, qWeight1D, W.getKernelArg()); - - return CEED_ERROR_SUCCESS; -} - -int TensorBasis::elementsPerBlockWeight() const { - int elementsPerBlock; - if (dim == 1) { - 
elementsPerBlock = 32 / Q1D; - } else if (dim == 2) { - if ((Q1D * Q1D) > 32) { - elementsPerBlock = 1; - } else { - elementsPerBlock = 32 / (Q1D * Q1D); - } - } else { - elementsPerBlock = Q1D; - } - return elementsPerBlock; -} - -int TensorBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) { - const bool transpose = tmode == CEED_TRANSPOSE; - - if ((dim < 1) || (3 < dim)) { - return ceedError("Backend only supports dimensions: 1, 2, and 3"); - } - - // Check arguments - if (emode != CEED_EVAL_WEIGHT) { - if (!U) { - return ceedError("Incorrect CeedVector input: U"); - } - } - if (!V) { - return ceedError("Incorrect CeedVector input: V"); - } - - try { - // Apply kernel - switch (emode) { - case CEED_EVAL_INTERP: - return applyInterp(elementCount, transpose, *U, *V); - case CEED_EVAL_GRAD: - return applyGrad(elementCount, transpose, *U, *V); - case CEED_EVAL_WEIGHT: - return applyWeight(elementCount, *V); - default: - return ceedError("Backend does not support given tensor eval mode"); - } - } catch (::occa::exception &exc) { - // Handle kernel build errors the CEED way - CeedHandleOccaException(exc); - } - - return CEED_ERROR_SUCCESS; -} - -//---[ Ceed Callbacks ]------------- -int TensorBasis::ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D, - const CeedScalar *qWeight1D, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - - if (Q1D < P1D && Context::from(ceed)->usingGpuDevice()) { - return staticCeedError("(OCCA) Backend does not implement underintegrated basis"); - } - - TensorBasis *basis_ = new TensorBasis(basis, dim, P1D, Q1D, interp1D, grad1D, qWeight1D); - CeedCallBackend(CeedBasisSetData(basis, basis_)); - - CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply); - CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy); - - return CEED_ERROR_SUCCESS; -} -} // namespace occa 
-} // namespace ceed diff --git a/backends/occa/ceed-occa-tensor-basis.hpp b/backends/occa/ceed-occa-tensor-basis.hpp deleted file mode 100644 index 35e345b8c9..0000000000 --- a/backends/occa/ceed-occa-tensor-basis.hpp +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_TENSORBASIS_HEADER -#define CEED_OCCA_TENSORBASIS_HEADER - -#include "ceed-occa-basis.hpp" - -namespace ceed { -namespace occa { -class TensorBasis : public Basis { - public: - CeedInt P1D; - CeedInt Q1D; - ::occa::memory interp1D; - ::occa::memory grad1D; - ::occa::memory qWeight1D; - - ::occa::json kernelProperties; - ::occa::kernel interpKernel; - ::occa::kernel interpTKernel; - ::occa::kernel gradKernel; - ::occa::kernel gradTKernel; - ::occa::kernel weightKernel; - - TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_, - const CeedScalar *qWeight1D_); - - ~TensorBasis(); - - bool isTensorBasis() const; - - const char *getFunctionSource() const; - - std::string getKernelSource() const; - - void setKernelProperties(); - - int elementsPerBlockInterp() const; - int elementsPerBlockGrad() const; - int elementsPerBlockWeight() const; - - ::occa::kernel buildKernel(const std::string &kernelName); - - int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); - - int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); - - int applyWeight(const CeedInt elementCount, Vector &W); - - int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V); - - //---[ Ceed Callbacks ]----------- - static int ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const 
CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D, - const CeedScalar *qWeight1D, CeedBasis basis); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-types.hpp b/backends/occa/ceed-occa-types.hpp deleted file mode 100644 index cc56791f85..0000000000 --- a/backends/occa/ceed-occa-types.hpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_TYPES_HEADER -#define CEED_OCCA_TYPES_HEADER - -#include - -#include - -#define CeedOccaFromChk(ierr) \ - do { \ - if (ierr) { \ - return NULL; \ - } \ - } while (0) - -#define CeedCallOcca(...) \ - do { \ - int ierr_q_ = __VA_ARGS__; \ - CeedOccaFromChk(ierr_q_); \ - } while (0); - -#define CeedOccaValidChk(isValidVar, ierr) \ - do { \ - if (ierr) { \ - isValidVar = false; \ - return; \ - } \ - } while (0) - -#define CeedCallOccaValid(isValidVar, ...) 
\ - do { \ - int ierr_q_ = __VA_ARGS__; \ - CeedOccaValidChk(isValidVar, ierr_q_); \ - } while (0); - -#define CeedHandleOccaException(exc) \ - do { \ - std::string error = exc.toString(); \ - return CeedError(ceed, CEED_ERROR_BACKEND, error.c_str()); \ - } while (0) - -#define CeedOccaCastRegisterFunction(func) (ceed::occa::ceedFunction)(void *) func - -#define CeedOccaRegisterBaseFunction(name, func) CeedCallBackend(registerCeedFunction(ceed, name, CeedOccaCastRegisterFunction(func))); - -#define CeedOccaRegisterFunction(object, name, func) CeedCallBackend(registerCeedFunction(ceed, object, name, CeedOccaCastRegisterFunction(func))); - -namespace ceed { -namespace occa { -typedef int (*ceedFunction)(); -} -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa-vector.cpp b/backends/occa/ceed-occa-vector.cpp deleted file mode 100644 index 0a5c51a28a..0000000000 --- a/backends/occa/ceed-occa-vector.cpp +++ /dev/null @@ -1,460 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "ceed-occa-vector.hpp" - -#include - -#include "ceed-occa-kernels.hpp" - -namespace ceed { -namespace occa { -Vector::Vector() : length(0), hostBufferLength(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {} - -Vector::~Vector() { - memory.free(); - freeHostBuffer(); -} - -int Vector::hasValidArray(bool *has_valid_array) { - (*has_valid_array) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized()); - return CEED_ERROR_SUCCESS; -} - -int Vector::hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type) { - switch (mem_type) { - case CEED_MEM_HOST: - (*has_borrowed_array_of_type) = !!currentHostBuffer; - break; - case CEED_MEM_DEVICE: - (*has_borrowed_array_of_type) = currentMemory.isInitialized(); - break; - } - return CEED_ERROR_SUCCESS; -} - -Vector *Vector::getVector(CeedVector vec, const bool assertValid) { - if (!vec || vec == CEED_VECTOR_NONE) { - return NULL; - } - - int ierr; - Vector *vector = NULL; - - ierr = CeedVectorGetData(vec, &vector); - if (assertValid) { - CeedOccaFromChk(ierr); - } - - return vector; -} - -Vector *Vector::from(CeedVector vec) { - Vector *vector = getVector(vec); - if (!vector) { - return NULL; - } - - CeedCallOcca(CeedVectorGetCeed(vec, &vector->ceed)); - CeedCallOcca(CeedVectorGetLength(vec, &vector->length)); - - return vector; -} - -void Vector::resize(const CeedSize length_) { length = length_; } - -void Vector::resizeMemory(const CeedSize length_) { resizeMemory(getDevice(), length_); } - -void Vector::resizeMemory(::occa::device device, const CeedSize length_) { - if (length_ != (CeedSize)memory.length()) { - memory.free(); - memory = device.malloc(length_); - } -} - -void Vector::resizeHostBuffer(const CeedSize length_) { - if (length_ != hostBufferLength) { - delete hostBuffer; - hostBuffer = new 
CeedScalar[length_]; - } -} - -void Vector::setCurrentMemoryIfNeeded() { - if (!currentMemory.isInitialized()) { - resizeMemory(length); - currentMemory = memory; - } -} - -void Vector::setCurrentHostBufferIfNeeded() { - if (!currentHostBuffer) { - resizeHostBuffer(length); - currentHostBuffer = hostBuffer; - } -} - -void Vector::freeHostBuffer() { - if (hostBuffer) { - delete[] hostBuffer; - hostBuffer = NULL; - } -} - -int Vector::setValue(CeedScalar value) { - // Prioritize keeping data in the device - if (syncState & SyncState::device) { - setCurrentMemoryIfNeeded(); - if (!setValueKernel.isInitialized()) { - ::occa::json kernelProperties; - CeedInt constexpr block_size{256}; - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/BLOCK_SIZE"] = block_size; - - std::string kernelSource = occa_set_value_source; - setValueKernel = getDevice().buildKernelFromString(kernelSource, "setValue", kernelProperties); - setValueKernel(currentMemory, value, length); - } - syncState = SyncState::device; - } else { - setCurrentHostBufferIfNeeded(); - for (CeedInt i = 0; i < length; ++i) { - currentHostBuffer[i] = value; - } - syncState = SyncState::host; - } - return CEED_ERROR_SUCCESS; -} - -int Vector::setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) { - switch (cmode) { - case CEED_COPY_VALUES: - return copyArrayValues(mtype, array); - case CEED_OWN_POINTER: - return ownArrayPointer(mtype, array); - case CEED_USE_POINTER: - return useArrayPointer(mtype, array); - } - return ceedError("Invalid CeedCopyMode passed"); -} - -int Vector::takeArray(CeedMemType mtype, CeedScalar **array) { - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - *array = currentHostBuffer; - hostBuffer = NULL; - currentHostBuffer = 
NULL; - - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - *array = memoryToArray(currentMemory); - memory = ::occa::null; - currentMemory = ::occa::null; - - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int Vector::copyArrayValues(CeedMemType mtype, CeedScalar *array) { - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostBufferIfNeeded(); - if (array) { - std::memcpy(currentHostBuffer, array, length * sizeof(CeedScalar)); - } - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentMemoryIfNeeded(); - if (array) { - currentMemory.copyFrom(arrayToMemory(array)); - } - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int Vector::ownArrayPointer(CeedMemType mtype, CeedScalar *array) { - switch (mtype) { - case CEED_MEM_HOST: - freeHostBuffer(); - hostBuffer = currentHostBuffer = array; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - memory = currentMemory = arrayToMemory(array); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int Vector::useArrayPointer(CeedMemType mtype, CeedScalar *array) { - switch (mtype) { - case CEED_MEM_HOST: - freeHostBuffer(); - currentHostBuffer = array; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - currentMemory = arrayToMemory(array); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int Vector::getArray(CeedMemType mtype, CeedScalar **array) { - // The passed `array` might be modified before restoring - // so we 
can't set sync state to SyncState::all - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - syncState = SyncState::host; - *array = currentHostBuffer; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - *array = memoryToArray(currentMemory); - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); -} - -int Vector::getReadOnlyArray(CeedMemType mtype, CeedScalar **array) { - const bool willBeFullySynced = - ((syncState == SyncState::device && mtype == CEED_MEM_HOST) || (syncState == SyncState::host && mtype == CEED_MEM_DEVICE)); - - const int error = getArray(mtype, const_cast(array)); - // Take advantage the vector will be fully synced - if (!error && willBeFullySynced) { - syncState = SyncState::all; - } - - return error; -} - -int Vector::getWriteOnlyArray(CeedMemType mtype, CeedScalar **array) { - // const bool willBeFullySynced = ( - // (syncState == SyncState::device && mtype == CEED_MEM_HOST) || - // (syncState == SyncState::host && mtype == CEED_MEM_DEVICE) - // ); - - const int error = getArray(mtype, const_cast(array)); - // // Take advantage the vector will be fully synced - // if (!error && willBeFullySynced) { - // syncState = SyncState::all; - // } - - return error; -} - -int Vector::restoreArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; } - -int Vector::restoreReadOnlyArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; } - -::occa::memory Vector::getKernelArg() { - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - return currentMemory; -} - -::occa::memory 
Vector::getConstKernelArg() { - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - syncState = SyncState::all; - } - return currentMemory; -} - -void Vector::printValues(const std::string &name) { - CeedScalar *values; - getReadOnlyArray(CEED_MEM_HOST, &values); - - std::cout << std::setprecision(8) << "Vector: " << name << std::endl << " - Values: " << std::endl; - - for (int i = 0; i < length; ++i) { - printf(" %12.8f\n", values[i]); - } -} - -void Vector::printNonZeroValues(const std::string &name) { - CeedScalar *values; - getReadOnlyArray(CEED_MEM_HOST, &values); - - std::cout << std::setprecision(8) << "Vector: " << name << std::endl << " - Non-zero values: " << std::endl; - - for (int i = 0; i < length; ++i) { - if (fabs(values[i]) > 1e-8) { - printf(" %d: %12.8f\n", i, values[i]); - } - } -} - -void Vector::printSummary(const std::string &name) { - CeedScalar *values; - getReadOnlyArray(CEED_MEM_HOST, &values); - - CeedScalar minValue = values[0]; - CeedScalar maxValue = values[0]; - - for (int i = 0; i < length; ++i) { - const CeedScalar value = values[i]; - minValue = minValue < value ? minValue : value; - maxValue = maxValue > value ? 
maxValue : value; - } - - std::cout << std::setprecision(8) << "Vector: " << name << std::endl - << " - Length: " << length << std::endl - << " - Min : " << minValue << std::endl - << " - Max : " << maxValue << std::endl; -} - -//---[ Ceed Callbacks ]----------- -int Vector::registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Vector", vec, fname, f); -} - -int Vector::ceedCreate(CeedSize length, CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - - CeedOccaRegisterFunction(vec, "HasValidArray", Vector::ceedHasValidArray); - CeedOccaRegisterFunction(vec, "HasBorrowedArrayOfType", Vector::ceedHasBorrowedArrayOfType); - CeedOccaRegisterFunction(vec, "SetValue", Vector::ceedSetValue); - CeedOccaRegisterFunction(vec, "SetArray", Vector::ceedSetArray); - CeedOccaRegisterFunction(vec, "TakeArray", Vector::ceedTakeArray); - CeedOccaRegisterFunction(vec, "GetArray", Vector::ceedGetArray); - CeedOccaRegisterFunction(vec, "GetArrayRead", Vector::ceedGetArrayRead); - CeedOccaRegisterFunction(vec, "GetArrayWrite", Vector::ceedGetArrayWrite); - CeedOccaRegisterFunction(vec, "RestoreArray", Vector::ceedRestoreArray); - CeedOccaRegisterFunction(vec, "RestoreArrayRead", Vector::ceedRestoreArrayRead); - CeedOccaRegisterFunction(vec, "Destroy", Vector::ceedDestroy); - - Vector *vector = new Vector(); - CeedCallBackend(CeedVectorSetData(vec, vector)); - - return CEED_ERROR_SUCCESS; -} - -int Vector::ceedHasValidArray(CeedVector vec, bool *has_valid_array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->hasValidArray(has_valid_array); -} - -int Vector::ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return 
vector->hasBorrowedArrayOfType(mem_type, has_borrowed_array_of_type); -} - -int Vector::ceedSetValue(CeedVector vec, CeedScalar value) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->setValue(value); -} - -int Vector::ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->setArray(mtype, cmode, array); -} - -int Vector::ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->takeArray(mtype, array); -} - -int Vector::ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->getArray(mtype, array); -} - -int Vector::ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->getReadOnlyArray(mtype, array); -} - -int Vector::ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->getWriteOnlyArray(mtype, array); -} - -int Vector::ceedRestoreArray(CeedVector vec, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->restoreArray(array); -} - -int Vector::ceedRestoreArrayRead(CeedVector vec, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return 
vector->restoreReadOnlyArray(array); -} - -int Vector::ceedDestroy(CeedVector vec) { - delete getVector(vec, false); - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed diff --git a/backends/occa/ceed-occa-vector.hpp b/backends/occa/ceed-occa-vector.hpp deleted file mode 100644 index 37abf5d7fa..0000000000 --- a/backends/occa/ceed-occa-vector.hpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_VECTOR_HEADER -#define CEED_OCCA_VECTOR_HEADER - -#include "ceed-occa-ceed-object.hpp" - -namespace ceed { -namespace occa { -template -::occa::memory arrayToMemory(const TM *array) { - if (array) { - ::occa::memory mem((::occa::modeMemory_t *)array); - mem.setDtype(::occa::dtype::get()); - return mem; - } - return ::occa::null; -} - -template -TM *memoryToArray(::occa::memory &memory) { - return (TM *)memory.getModeMemory(); -} - -class Vector : public CeedObject { - public: - // Owned resources - CeedSize length; - ::occa::memory memory; - CeedSize hostBufferLength; - CeedScalar *hostBuffer; - - ::occa::kernel setValueKernel; - - // Current resources - ::occa::memory currentMemory; - CeedScalar *currentHostBuffer; - - // State information - int syncState; - - Vector(); - - ~Vector(); - - int hasValidArray(bool *has_valid_array); - - int hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type); - - static Vector *getVector(CeedVector vec, const bool assertValid = true); - - static Vector *from(CeedVector vec); - - void resize(const CeedSize length_); - - void resizeMemory(const CeedSize length_); - - void resizeMemory(::occa::device device, const CeedSize length_); - - void resizeHostBuffer(const CeedSize length_); - - void 
setCurrentMemoryIfNeeded(); - - void setCurrentHostBufferIfNeeded(); - - void freeHostBuffer(); - - int setValue(CeedScalar value); - - int setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array); - - int takeArray(CeedMemType mtype, CeedScalar **array); - - int copyArrayValues(CeedMemType mtype, CeedScalar *array); - - int ownArrayPointer(CeedMemType mtype, CeedScalar *array); - - int useArrayPointer(CeedMemType mtype, CeedScalar *array); - - int getArray(CeedMemType mtype, CeedScalar **array); - - int getReadOnlyArray(CeedMemType mtype, CeedScalar **array); - - int getWriteOnlyArray(CeedMemType mtype, CeedScalar **array); - - int restoreArray(CeedScalar **array); - - int restoreReadOnlyArray(CeedScalar **array); - - ::occa::memory getKernelArg(); - - ::occa::memory getConstKernelArg(); - - void printValues(const std::string &name); - void printNonZeroValues(const std::string &name); - void printSummary(const std::string &name); - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f); - - static int ceedHasValidArray(CeedVector vec, bool *has_valid_array); - - static int ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type); - - static int ceedCreate(CeedSize length, CeedVector vec); - - static int ceedSetValue(CeedVector vec, CeedScalar value); - - static int ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array); - - static int ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array); - - static int ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array); - - static int ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array); - - static int ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array); - - static int ceedRestoreArray(CeedVector vec, CeedScalar **array); - - static int ceedRestoreArrayRead(CeedVector vec, CeedScalar 
**array); - - static int ceedDestroy(CeedVector vec); -}; -} // namespace occa -} // namespace ceed - -#endif diff --git a/backends/occa/ceed-occa.cpp b/backends/occa/ceed-occa.cpp deleted file mode 100644 index d43231f2a1..0000000000 --- a/backends/occa/ceed-occa.cpp +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#warning "libCEED OCCA backend is experimental; for best performance, use device native backends" - -#include -#include -#include - -#include "ceed-occa-context.hpp" -#include "ceed-occa-elem-restriction.hpp" -#include "ceed-occa-operator.hpp" -#include "ceed-occa-qfunction.hpp" -#include "ceed-occa-qfunctioncontext.hpp" -#include "ceed-occa-simplex-basis.hpp" -#include "ceed-occa-tensor-basis.hpp" -#include "ceed-occa-types.hpp" -#include "ceed-occa-vector.hpp" - -namespace ceed { -namespace occa { -typedef std::map StringMap; -typedef std::vector StringVector; - -enum ResourceParserStep { RESOURCE, QUERY_KEY, QUERY_VALUE }; - -static const char RESOURCE_DELIMITER = '/'; -static const char QUERY_DELIMITER = ':'; -static const char QUERY_KEY_VALUE_DELIMITER = '='; -static const char QUERY_ARG_DELIMITER = ','; - -static std::string getDefaultDeviceMode(const bool cpuMode, const bool gpuMode) { - // In case both cpuMode and gpuMode are set, prioritize the GPU if available - // For example, if the resource is "/*/occa" - if (gpuMode) { - if (::occa::modeIsEnabled("CUDA")) { - return "CUDA"; - } - if (::occa::modeIsEnabled("HIP")) { - return "HIP"; - } - if (::occa::modeIsEnabled("dpcpp")) { - return "dpcpp"; - } - if (::occa::modeIsEnabled("OpenCL")) { - return "OpenCL"; - } - // Metal doesn't support doubles - } - - if (cpuMode) { - if (::occa::modeIsEnabled("OpenMP")) { - return "OpenMP"; 
- } - return "Serial"; - } - - return ""; -} - -static int getDeviceMode(const std::string &match, std::string &mode) { - if (match == "cuda") { - mode = "CUDA"; - return CEED_ERROR_SUCCESS; - } - if (match == "hip") { - mode = "HIP"; - return CEED_ERROR_SUCCESS; - } - if (match == "dpcpp") { - mode = "dpcpp"; - return CEED_ERROR_SUCCESS; - } - if (match == "opencl") { - mode = "OpenCL"; - return CEED_ERROR_SUCCESS; - } - if (match == "openmp") { - mode = "OpenMP"; - return CEED_ERROR_SUCCESS; - } - if (match == "serial") { - mode = "Serial"; - return CEED_ERROR_SUCCESS; - } - - const bool autoMode = match == "*"; - const bool cpuMode = match == "cpu"; - const bool gpuMode = match == "gpu"; - - mode = getDefaultDeviceMode(cpuMode || autoMode, gpuMode || autoMode); - return !mode.size(); -} - -static int splitCeedResource(const std::string &resource, std::string &match, StringMap &query) { - /* - * resource: - * - * "/gpu/occa?mode='CUDA':device_id=0" - * - * resourceVector: - * - * ["gpu", "occa"] - * - * match: - * - * "gpu" - * - * query: - * - * { - * "mode": "'CUDA'", - * "device_id": "0", - * } - */ - const int charCount = (int)resource.size(); - const char *c_resource = resource.c_str(); - - StringVector resourceVector; - - ResourceParserStep parsingStep = RESOURCE; - int wordStart = 1; - std::string queryKey; - - // Check for /gpu/cuda/occa, /gpu/hip/occa, /cpu/self/occa, /cpu/openmp/occa - // Note: added for matching style with other backends - if (resource == "/gpu/cuda/occa") { - match = "cuda"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/gpu/hip/occa") { - match = "hip"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/gpu/dpcpp/occa") { - match = "dpcpp"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/gpu/opencl/occa") { - match = "opencl"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/cpu/openmp/occa") { - match = "openmp"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/cpu/self/occa") { - match = "serial"; - return 
CEED_ERROR_SUCCESS; - } - - // Skip initial slash - for (int i = 1; i <= charCount; ++i) { - const char c = c_resource[i]; - - if (parsingStep == RESOURCE) { - if (c == RESOURCE_DELIMITER || c == QUERY_DELIMITER || c == '\0') { - resourceVector.push_back(resource.substr(wordStart, i - wordStart)); - wordStart = i + 1; - - // Check if we are done parsing the resource - if (c == QUERY_DELIMITER) { - parsingStep = QUERY_KEY; - } - } - } else if (parsingStep == QUERY_KEY) { - if (c == QUERY_KEY_VALUE_DELIMITER) { - queryKey = resource.substr(wordStart, i - wordStart); - wordStart = i + 1; - - // Looking to parse the query value now - parsingStep = QUERY_VALUE; - } - } else if (parsingStep == QUERY_VALUE) { - if (c == QUERY_ARG_DELIMITER || c == '\0') { - query[queryKey] = resource.substr(wordStart, i - wordStart); - wordStart = i + 1; - - // Back to parsing the next query argument - parsingStep = QUERY_KEY; - queryKey = ""; - } - } - } - - // Looking for [match, "occa"] - if (resourceVector.size() != 2 || resourceVector[1] != "occa") { - return 1; - } - - match = resourceVector[0]; - return CEED_ERROR_SUCCESS; -} - -void setDefaultProps(::occa::properties &deviceProps, const std::string &defaultMode) { - std::string mode; - if (deviceProps.has("mode")) { - // Don't override mode if passed - mode = (std::string)deviceProps["mode"]; - } else { - mode = defaultMode; - deviceProps.set("mode", mode); - } - - // Set default device id - if ((mode == "CUDA") || (mode == "HIP") || (mode == "dpcpp") || (mode == "OpenCL")) { - if (!deviceProps.has("device_id")) { - deviceProps["device_id"] = 0; - } - } - - // Set default platform id - if ((mode == "dpcpp") || (mode == "OpenCL")) { - if (!deviceProps.has("platform_id")) { - deviceProps["platform_id"] = 0; - } - } -} - -static int initCeed(const char *c_resource, Ceed ceed) { - int ierr; - std::string match; - StringMap query; - - ierr = splitCeedResource(c_resource, match, query); - if (ierr) { - return CeedError(ceed, 
CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource); - } - - std::string mode; - ierr = getDeviceMode(match, mode); - if (ierr) { - return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource); - } - - std::string devicePropsStr = "{\n"; - StringMap::const_iterator it; - for (it = query.begin(); it != query.end(); ++it) { - devicePropsStr += " \""; - devicePropsStr += it->first; - devicePropsStr += "\": "; - devicePropsStr += it->second; - devicePropsStr += ",\n"; - } - devicePropsStr += '}'; - - ::occa::properties deviceProps(devicePropsStr); - setDefaultProps(deviceProps, mode); - - ceed::occa::Context *context = new Context(::occa::device(deviceProps)); - CeedCallBackend(CeedSetData(ceed, context)); - - return CEED_ERROR_SUCCESS; -} - -static int destroyCeed(Ceed ceed) { - delete Context::from(ceed); - return CEED_ERROR_SUCCESS; -} - -static int registerCeedFunction(Ceed ceed, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Ceed", ceed, fname, f); -} - -static int preferHostMemType(CeedMemType *type) { - *type = CEED_MEM_HOST; - return CEED_ERROR_SUCCESS; -} - -static int preferDeviceMemType(CeedMemType *type) { - *type = CEED_MEM_DEVICE; - return CEED_ERROR_SUCCESS; -} - -static ceed::occa::ceedFunction getPreferredMemType(Ceed ceed) { - if (Context::from(ceed)->device.hasSeparateMemorySpace()) { - return (ceed::occa::ceedFunction)(void *)preferDeviceMemType; - } - return (ceed::occa::ceedFunction)(void *)preferHostMemType; -} - -static int registerMethods(Ceed ceed) { - CeedOccaRegisterBaseFunction("Destroy", ceed::occa::destroyCeed); - CeedOccaRegisterBaseFunction("GetPreferredMemType", getPreferredMemType(ceed)); - CeedOccaRegisterBaseFunction("VectorCreate", ceed::occa::Vector::ceedCreate); - CeedOccaRegisterBaseFunction("BasisCreateTensorH1", ceed::occa::TensorBasis::ceedCreate); - CeedOccaRegisterBaseFunction("BasisCreateH1", 
ceed::occa::SimplexBasis::ceedCreate); - CeedOccaRegisterBaseFunction("ElemRestrictionCreate", ceed::occa::ElemRestriction::ceedCreate); - CeedOccaRegisterBaseFunction("QFunctionCreate", ceed::occa::QFunction::ceedCreate); - CeedOccaRegisterBaseFunction("QFunctionContextCreate", ceed::occa::QFunctionContext::ceedCreate); - CeedOccaRegisterBaseFunction("OperatorCreate", ceed::occa::Operator::ceedCreate); - CeedOccaRegisterBaseFunction("CompositeOperatorCreate", ceed::occa::Operator::ceedCreateComposite); - - return CEED_ERROR_SUCCESS; -} - -static int registerBackend(const char *resource, Ceed ceed) { - try { - CeedCallBackend(ceed::occa::initCeed(resource, ceed)); - } catch (const ::occa::exception &e) { - CeedHandleOccaException(e); - } - try { - CeedCallBackend(ceed::occa::registerMethods(ceed)); - } catch (const ::occa::exception &e) { - CeedHandleOccaException(e); - } - return CEED_ERROR_SUCCESS; -} -} // namespace occa -} // namespace ceed - -CEED_INTERN int CeedRegister_Occa(void) { - // General mode - CeedCallBackend(CeedRegister("/*/occa", ceed::occa::registerBackend, 270)); - // CPU Modes - CeedCallBackend(CeedRegister("/cpu/self/occa", ceed::occa::registerBackend, 260)); - CeedCallBackend(CeedRegister("/cpu/openmp/occa", ceed::occa::registerBackend, 250)); - // GPU Modes - CeedCallBackend(CeedRegister("/gpu/dpcpp/occa", ceed::occa::registerBackend, 240)); - CeedCallBackend(CeedRegister("/gpu/opencl/occa", ceed::occa::registerBackend, 230)); - CeedCallBackend(CeedRegister("/gpu/hip/occa", ceed::occa::registerBackend, 220)); - CeedCallBackend(CeedRegister("/gpu/cuda/occa", ceed::occa::registerBackend, 210)); - return CEED_ERROR_SUCCESS; -} diff --git a/backends/occa/ceed-occa.h b/backends/occa/ceed-occa.h deleted file mode 100644 index d9dc78ebd4..0000000000 --- a/backends/occa/ceed-occa.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include -#include -#include -#include -#include -#include - -// ***************************************************************************** -#define OCCA_PATH_MAX 4096 - -// ***************************************************************************** -// used to get Dl_info struct declaration (vs _GNU_SOURCE?) -#ifndef __USE_GNU -#define __USE_GNU -#endif -#include - -// ***************************************************************************** -#include "occa.h" - -// ***************************************************************************** -#define NO_OFFSET 0 -#define TILE_SIZE 32 -#define NO_PROPS occaDefault - -// ***************************************************************************** -// * CeedVector Occa struct -// ***************************************************************************** -typedef struct { - CeedScalar *h_array; - CeedScalar *h_array_allocated; - occaMemory d_array; -} CeedVector_Occa; - -// ***************************************************************************** -// * CeedElemRestriction Occa struct -// ***************************************************************************** -#define CEED_OCCA_NUM_RESTRICTION_KERNELS 8 -typedef struct { - bool strided; - occaMemory d_indices; - occaMemory d_toffsets; - occaMemory d_tindices; - occaKernel kRestrict[CEED_OCCA_NUM_RESTRICTION_KERNELS]; -} CeedElemRestriction_Occa; - -// ***************************************************************************** -// * CeedBasis Occa struct -// ***************************************************************************** -typedef struct { - bool ready; - CeedElemRestriction er; - occaMemory qref1d; - occaMemory qweight1d; - occaMemory interp1d; - occaMemory grad1d; - occaMemory tmp0, tmp1; - occaKernel kZero, kInterp, kGrad, kWeight; -} CeedBasis_Occa; - -// 
***************************************************************************** -// * CeedOperator Occa struct -// ***************************************************************************** -typedef struct { - CeedVector *Evecs; /// E-vectors needed to apply operator (in followed by out) - CeedScalar **Edata; - CeedVector *evecsin; /// Input E-vectors needed to apply operator - CeedVector *evecsout; /// Output E-vectors needed to apply operator - CeedVector *qvecsin; /// Input Q-vectors needed to apply operator - CeedVector *qvecsout; /// Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; -} CeedOperator_Occa; - -// ***************************************************************************** -// * CeedQFunction Occa struct -// ***************************************************************************** -#define N_MAX_IDX 16 -typedef struct { - bool ready; - CeedInt idx, odx; - CeedInt iOf7[N_MAX_IDX]; - CeedInt oOf7[N_MAX_IDX]; - int nc, dim, nelem, elemsize, e; - occaMemory o_indata, o_outdata; - occaMemory d_ctx, d_idx, d_odx; - char *oklPath; - const char *qFunctionName; - occaKernel kQFunctionApply; - CeedOperator op; -} CeedQFunction_Occa; - -// ***************************************************************************** -// * CeedQFunctionContext Occa struct -// ***************************************************************************** -typedef struct { - CeedScalar *h_data; - CeedScalar *h_data_allocated; -} CeedQFunctionContext_Occa; - -// ***************************************************************************** -// * Ceed Occa struct -// ***************************************************************************** -typedef struct { - occaDevice device; - bool ocl; - char *libceed_dir; - char *occa_cache_dir; -} Ceed_Occa; - -// ***************************************************************************** -CEED_INTERN int CeedOklPath_Occa(const Ceed, const char *, const char *, char **); - -// 
***************************************************************************** -CEED_INTERN int CeedOklDladdr_Occa(Ceed); - -// ***************************************************************************** -CEED_INTERN int CeedBasisCreateTensorH1_Occa(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, - const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); - -// ***************************************************************************** -CEED_INTERN int CeedBasisCreateH1_Occa(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp1d, - const CeedScalar *grad1d, const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); - -// ***************************************************************************** -CEED_INTERN int CeedBasisApplyElems_Occa(CeedBasis basis, CeedInt Q, CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, CeedVector v); - -// ***************************************************************************** -CEED_INTERN int CeedOperatorCreate_Occa(CeedOperator op); - -// ***************************************************************************** -CEED_INTERN int CeedQFunctionCreate_Occa(CeedQFunction qf); - -// ***************************************************************************** -CEED_INTERN int CeedQFunctionContextCreate_Occa(CeedQFunctionContext ctx); - -// ***************************************************************************** -CEED_INTERN int CeedElemRestrictionCreate_Occa(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, const bool *orients, - const CeedInt8 *curl_orients, const CeedElemRestriction res); - -// ***************************************************************************** -CEED_INTERN int CeedVectorCreate_Occa(CeedInt n, CeedVector vec); diff --git a/backends/occa/kernels/elem-restriction.cpp b/backends/occa/kernels/elem-restriction.cpp deleted file mode 100644 index 
947556be1f..0000000000 --- a/backends/occa/kernels/elem-restriction.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "./kernel-defines.hpp" - -// Kernels are based on the cuda backend from LLNL and VT groups -// -// Expects the following types to be defined: -// - CeedInt -// - CeedScalar -// -// Expects the following constants to be defined: -// - COMPONENT_COUNT : CeedInt -// - ELEMENT_SIZE : CeedInt -// - NODE_COUNT : CeedInt -// - TILE_SIZE : int -// - USES_INDICES : bool -// - STRIDE_TYPE : ceed::occa::StrideType -// - NODE_STRIDE : Optional[CeedInt] -// - COMPONENT_STRIDE : Optional[CeedInt] -// - ELEMENT_STRIDE : Optional[CeedInt] -// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt] - -const char *occa_elem_restriction_source = STRINGIFY_SOURCE( - - @directive("#define PRINT_KERNEL_HASHES 0") - - typedef CeedScalar * - QuadVector @dim(ELEMENT_SIZE, COMPONENT_COUNT, elementCount); - - @kernel void applyRestriction(const CeedInt elementCount, const CeedInt *indices, CeedScalar *u, QuadVector v) { - @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) { - @directive("#if PRINT_KERNEL_HASHES") - // Print to see which kernel is being run - if (element == 0) { - printf("\n\napplyRestriction Kernel: " OKL_KERNEL_HASH "\n\n"); - } - @directive("#endif") - - @directive("#if USES_INDICES") for (int node = 0; node < ELEMENT_SIZE; ++node) { - const CeedInt index = indices[node + (element * ELEMENT_SIZE)]; - - for (int c = 0; c < COMPONENT_COUNT; ++c) { - v(node, c, element) = u[index + (c * UNSTRIDED_COMPONENT_STRIDE)]; - } - } - @directive("#else") for (int node = 0; node < ELEMENT_SIZE; ++node) { - for (int c = 0; c < COMPONENT_COUNT; ++c) { - v(node, 
c, element) = u[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)]; - } - } - @directive("#endif") - } - } - - @directive("#if USES_INDICES") - - @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets, - const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) { - @tile(TILE_SIZE, @outer, @inner) for (int n = 0; n < NODE_COUNT; ++n) { - @directive("#if PRINT_KERNEL_HASHES") - // Print to see which kernel is being run - if (n == 0) { - printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n"); - } - @directive("#endif") - - CeedScalar vComp[COMPONENT_COUNT]; - - // Prefetch index information - const CeedInt vIndex = quadIndices[n]; - const CeedInt offsetStart = dofOffsets[n]; - const CeedInt offsetEnd = dofOffsets[n + 1]; - - for (int c = 0; c < COMPONENT_COUNT; ++c) { - vComp[c] = 0; - } - - // Aggregate by component - for (CeedInt i = offsetStart; i < offsetEnd; ++i) { - const CeedInt index = dofIndices[i]; - - const int node = (index % ELEMENT_SIZE); - const int element = (index / ELEMENT_SIZE); - - for (int c = 0; c < COMPONENT_COUNT; ++c) { - vComp[c] += u(node, c, element); - } - } - - // Update dofs by component - for (int c = 0; c < COMPONENT_COUNT; ++c) { - v[vIndex + (c * UNSTRIDED_COMPONENT_STRIDE)] += vComp[c]; - } - } - } - - @directive("#else") // USES_INDICES = false - - @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets, - const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) { - @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) { - @directive("#if PRINT_KERNEL_HASHES") - // Print to see which kernel is being run - if (element == 0) { - printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n"); - } - @directive("#endif") - - for (int node = 0; node < ELEMENT_SIZE; ++node) { - for (int c = 0; c < COMPONENT_COUNT; 
++c) { - v[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)] += u(node, c, element); - } - } - } - } - - @directive("#endif") // USES_INDICES - -); diff --git a/backends/occa/kernels/elem-restriction.hpp b/backends/occa/kernels/elem-restriction.hpp deleted file mode 100644 index ac45de6c49..0000000000 --- a/backends/occa/kernels/elem-restriction.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_KERNELS_ELEMRESTRICTION_HEADER -#define CEED_OCCA_KERNELS_ELEMRESTRICTION_HEADER - -// Kernels are based on the cuda backend from LLNL and VT groups -// -// Expects the following types to be defined: -// - CeedInt -// - CeedScalar -// -// Expects the following constants to be defined: -// - COMPONENT_COUNT : CeedInt -// - ELEMENT_SIZE : CeedInt -// - NODE_COUNT : CeedInt -// - TILE_SIZE : int -// - USES_INDICES : bool -// - STRIDE_TYPE : ceed::occa::StrideType -// - NODE_STRIDE : Optional[CeedInt] -// - COMPONENT_STRIDE : Optional[CeedInt] -// - ELEMENT_STRIDE : Optional[CeedInt] -// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt] - -extern const char *occa_elem_restriction_source; - -#endif diff --git a/backends/occa/kernels/set-value.cpp b/backends/occa/kernels/set-value.cpp deleted file mode 100644 index a7a756e442..0000000000 --- a/backends/occa/kernels/set-value.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "./kernel-defines.hpp" - -// Expects the following types to be defined: -// - CeedInt -// - CeedScalar -// -// Expects the following constants to be defined: -// - BLOCK_SIZE : CeedInt - -const char *occa_set_value_source = STRINGIFY_SOURCE( - - @kernel void setValue(CeedScalar *ptr, const CeedScalar value, const CeedInt count) { - @tile(BLOCK_SIZE, @outer, @inner) for (CeedInt i = 0; i < count; ++i) { - ptr[i] = value; - } - }); diff --git a/backends/occa/kernels/set-value.hpp b/backends/occa/kernels/set-value.hpp deleted file mode 100644 index fa5303f5f0..0000000000 --- a/backends/occa/kernels/set-value.hpp +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_KERNELS_SETVALUE_HEADER -#define CEED_OCCA_KERNELS_SETVALUE_HEADER - -// Expects the following types to be defined: -// - CeedInt -// - CeedScalar -// -// Expects the following constants to be defined: -// - BLOCK_SIZE : CeedInt - -extern const char *occa_set_value_source; - -#endif \ No newline at end of file diff --git a/backends/occa/kernels/simplex-basis.hpp b/backends/occa/kernels/simplex-basis.hpp deleted file mode 100644 index 4f53e5c6dd..0000000000 --- a/backends/occa/kernels/simplex-basis.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_KERNELS_SIMPLEXBASIS_HEADER -#define CEED_OCCA_KERNELS_SIMPLEXBASIS_HEADER - -// Kernels are based on the cuda backend from LLNL and VT groups -// -// Expects the following types to be defined: -// - CeedInt -// - CeedScalar -// -// Expects the following constants to be defined: -// - DIM : CeedInt -// - Q : CeedInt -// - P : CeedInt -// - MAX_PQ : CeedInt -// - BASIS_COMPONENT_COUNT: CeedInt -// - ELEMENTS_PER_BLOCK : CeedInt -// - TRANSPOSE : bool - -extern const char *occa_simplex_basis_cpu_function_source; -extern const char *occa_simplex_basis_cpu_kernel_source; - -extern const char *occa_simplex_basis_gpu_source; - -#endif diff --git a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp deleted file mode 100644 index 39a36684c2..0000000000 --- a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../kernel-defines.hpp" - -const char *occa_simplex_basis_cpu_function_source = STRINGIFY_SOURCE( - - @directive("#define SIMPLEX_FUNCTION(FUNCTION_NAME) simplex_ ## DIM ## d_ ## FUNCTION_NAME ## _Q ## Q ## _P ## P") - - inline void SIMPLEX_FUNCTION(interpElement)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) { - for (int q = 0; q < Q; ++q) { - CeedScalar v = 0; - for (int p = 0; p < P; ++p) { - v += B(p, q) * Ue[p]; - } - Ve[q] = v; - } - } - - inline void SIMPLEX_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) { - for (int p = 0; p < P; ++p) { - CeedScalar v = 0; - for (int q = 0; q < Q; ++q) { - v += B(p, q) * Ue[q]; - } - Ve[p] = v; - } - } - - inline void SIMPLEX_FUNCTION(gradElement)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve, ) { - for (int q = 0; q < Q; ++q) { - CeedScalar v[DIM]; - for (int dim = 0; dim < DIM; ++dim) { - v[dim] = 0; - } - - for (int p = 0; p < P; ++p) { - const CeedScalar u = Ue[p]; - for (int dim = 0; dim < DIM; ++dim) { - v[dim] += Bx(p, q, dim) * u; - } - } - - for (int dim = 0; dim < DIM; ++dim) { - Ve[dim * Q + q] = v[dim]; - } - } - } - - inline void SIMPLEX_FUNCTION(gradElementTranspose)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve) { - for (int p = 0; p < P; ++p) { - CeedScalar v = 0; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - v += Bx(p, q, dim) * Ue[dim * Q + q]; - } - } - Ve[p] = v; - } - } - - inline void SIMPLEX_FUNCTION(weightElement)(const CeedScalar *qWeights, CeedScalar *We) { - for (int q = 0; q < Q; ++q) { - We[q] = qWeights[q]; - } - } - -); - -const char *occa_simplex_basis_cpu_kernel_source = STRINGIFY_SOURCE( - - @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { - for (int 
element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = V; - - SIMPLEX_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component)); - } else { - const CeedScalar *Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V; - - SIMPLEX_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element)); - } - } - } - } - - @kernel void grad(const CeedInt elementCount, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *_Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = V; - - CeedScalar Ve[DIM][Q]; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - Ve[dim][q] = _Ve(q, element, component, dim); - } - } - - SIMPLEX_FUNCTION(gradElement)(Bx, &Ue(0, component, element), (CeedScalar *)Ve); - - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - _Ve(q, element, component, dim) = Ve[dim][q]; - } - } - } else { - const CeedScalar *_Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = U; - CeedScalar *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V; - - CeedScalar Ue[DIM][Q]; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - Ue[dim][q] = _Ue(q, element, component, dim); - } - } - - SIMPLEX_FUNCTION(gradElementTranspose)(Bx, (CeedScalar *)Ue, &Ve(0, component, element)); - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights, CeedScalar *W @dim(Q, 
elementCount)) { - @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { - SIMPLEX_FUNCTION(weightElement)(qWeights, &W(0, element)); - } - } - -); diff --git a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp deleted file mode 100644 index aa09fa60d7..0000000000 --- a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../kernel-defines.hpp" - -const char *occa_simplex_basis_gpu_source = STRINGIFY_SOURCE( - - @directive("#if TRANSPOSE") typedef CeedScalar * dofArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); - typedef CeedScalar * quadArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM); - @directive("#else") typedef CeedScalar * dofArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM); - typedef CeedScalar * quadArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); @directive("#endif") - - typedef CeedScalar * - quadToDof @dim(P, Q); - typedef CeedScalar * dQuadToDof @dim(P, Q, DIM); typedef CeedScalar * elementWeightArray @dim(Q, elementCount); - - @kernel void interp(const CeedInt elementCount, const quadToDof B, const dofArray U, quadArray V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar s_B[P * Q] @dim(P, Q); - - // Store weights in shared memory - for (int i = 0; i < MAX_PQ; ++i; @inner) { - for (int j = i; j < (P * Q); j += MAX_PQ) { - s_B[j] = B[j]; - } - } - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { - for (int i = 0; i < MAX_PQ; ++i; @inner) { - const int element = elementOffset + 
localElement; - if (element < elementCount) { - // Element operation - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - if (!TRANSPOSE) { - const int q = i; - if (q < Q) { - CeedScalar v = 0; - for (int p = 0; p < P; ++p) { - v += s_B(p, q) * U(p, component, element, 0); - } - V(q, element, component, 0) = v; - } - } else { - const int p = i; - if (p < P) { - CeedScalar v = 0; - for (int q = 0; q < Q; ++q) { - v += s_B(p, q) * U(q, element, component, 0); - } - V(p, component, element, 0) = v; - } - } - } - } - } - } - } - } - - @kernel void grad(const CeedInt elementCount, const dQuadToDof Bx, const dofArray U, quadArray V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar s_Bx[Q * P * DIM] @dim(P, Q, DIM); - - // Store weights in shared memory - for (int i = 0; i < MAX_PQ; ++i; @inner) { - for (int j = i; j < (P * Q * DIM); j += MAX_PQ) { - s_Bx[j] = Bx[j]; - } - } - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { - for (int i = 0; i < MAX_PQ; ++i; @inner) { - const int element = elementOffset + localElement; - if (element < elementCount) { - // Element operation - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - if (!TRANSPOSE) { - const int q = i; - if (q < Q) { - CeedScalar v[DIM]; - for (int dim = 0; dim < DIM; ++dim) { - v[dim] = 0; - } - - for (int p = 0; p < P; ++p) { - const CeedScalar u = U(p, component, element, 0); - for (int dim = 0; dim < DIM; ++dim) { - v[dim] += s_Bx(p, q, dim) * u; - } - } - - for (int dim = 0; dim < DIM; ++dim) { - V(q, element, component, dim) = v[dim]; - } - } - } else { - const int p = i; - if (p < P) { - CeedScalar v = 0; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - v += s_Bx(p, q, dim) * U(q, element, component, dim); - } - } - V(p, component, element, 0) = v; - } - } - } - } - } - } - } - } - - @kernel void weight(const CeedInt 
elementCount, const CeedScalar *qWeights, elementWeightArray W) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar s_qWeights[Q]; - - for (int q = 0; q < Q; ++q; @inner) { - s_qWeights[q] = qWeights[q]; - } - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { - const int element = elementOffset + localElement; - if (element < elementCount) { - for (int q = 0; q < Q; ++q; @inner) { - W(q, element) = s_qWeights[q]; - } - } - } - } - } - -); diff --git a/backends/occa/kernels/tensor-basis.hpp b/backends/occa/kernels/tensor-basis.hpp deleted file mode 100644 index 54850a7830..0000000000 --- a/backends/occa/kernels/tensor-basis.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_OCCA_KERNELS_TENSORBASIS_HEADER -#define CEED_OCCA_KERNELS_TENSORBASIS_HEADER - -// Kernels are based on the cuda backend from LLNL and VT groups -// -// Expects the following types to be defined: -// - CeedInt -// - CeedScalar -// -// Expects the following constants to be defined: -// - Q1D : CeedInt -// - P1D : CeedInt -// - BASIS_COMPONENT_COUNT: CeedInt -// - ELEMENTS_PER_BLOCK : CeedInt -// - SHARED_BUFFER_SIZE : CeedInt -// - TRANSPOSE : bool - -extern const char *occa_tensor_basis_1d_cpu_function_source; -extern const char *occa_tensor_basis_1d_cpu_kernel_source; - -extern const char *occa_tensor_basis_2d_cpu_function_source; -extern const char *occa_tensor_basis_2d_cpu_kernel_source; - -extern const char *occa_tensor_basis_3d_cpu_function_source; -extern const char *occa_tensor_basis_3d_cpu_kernel_source; - -extern const char *occa_tensor_basis_1d_gpu_source; -extern const char 
*occa_tensor_basis_2d_gpu_source; -extern const char *occa_tensor_basis_3d_gpu_source; - -#endif diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp deleted file mode 100644 index 90c9e905d5..0000000000 --- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../kernel-defines.hpp" - -const char *occa_tensor_basis_1d_cpu_function_source = STRINGIFY_SOURCE( - - @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_1d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") - - inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) { - for (int q = 0; q < Q1D; ++q) { - CeedScalar Vq = 0; - for (int p = 0; p < P1D; ++p) { - Vq += B(p, q) * Ue[p]; - } - Ve[q] = Vq; - } - } - - inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) { - for (int p = 0; p < P1D; ++p) { - CeedScalar Vp = 0; - for (int q = 0; q < Q1D; ++q) { - Vp += B(p, q) * Ue[q]; - } - Ve[p] = Vp; - } - } - - inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue, - CeedScalar *Ve) { - for (int q = 0; q < Q1D; ++q) { - CeedScalar Vq = 0; - for (int p = 0; p < P1D; ++p) { - Vq += Bx(p, q) * Ue[p]; - } - Ve[q] = Vq; - } - } - - inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue, - CeedScalar *Ve) { - for (int p = 0; p < P1D; ++p) { - CeedScalar Vp = 0; - for (int q = 0; q < Q1D; ++q) { - Vp += Bx(p, 
q) * Ue[q]; - } - Ve[p] = Vp; - } - } - - inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We) { - for (int q = 0; q < Q1D; ++q) { - We[q] = qWeights1D[q]; - } - } - -); - -const char *occa_tensor_basis_1d_cpu_kernel_source = STRINGIFY_SOURCE( - - @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component)); - } else { - const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element)); - } - } - } - } - - @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, component, element), &Ve(0, element, component)); - } else { - const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(gradElementTranspose)(B, Bx, &Ue(0, element, component), &Ve(0, component, element)); - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W 
@dim(Q1D, elementCount)) { - @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { - TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, element)); - } - } - -); diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp deleted file mode 100644 index d150129584..0000000000 --- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../kernel-defines.hpp" - -const char *occa_tensor_basis_2d_cpu_function_source = STRINGIFY_SOURCE( - - @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_2d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") - - inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D), - CeedScalar *Ve @dim(Q1D, Q1D)) { - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy) = 0; - } - } - - for (int py = 0; py < P1D; ++py) { - CeedScalar V_x[Q1D]; - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] = 0; - } - - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py); - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] += B(px, qx) * Up; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar w = B(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy) += w * V_x[qx]; - } - } - } - } - - inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D)) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py) = 0; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar V_x[P1D]; - for (int 
py = 0; py < P1D; ++py) { - V_x[py] = 0; - } - - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Up = Ue(qx, qy); - for (int px = 0; px < P1D; ++px) { - V_x[px] += B(px, qx) * Up; - } - } - - for (int py = 0; py < P1D; ++py) { - const CeedScalar w = B(py, qy); - for (int px = 0; px < P1D; ++px) { - Ve(px, py) += w * V_x[px]; - } - } - } - } - - inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue @dim(P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D), CeedScalar *Ve_y @dim(Q1D, Q1D)) { - CeedScalar grad[Q1D][Q1D][2]; - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - grad[qy][qx][0] = 0; - grad[qy][qx][1] = 0; - } - } - - for (int py = 0; py < P1D; ++py) { - CeedScalar gradX[Q1D][2]; - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] = 0; - gradX[qx][1] = 0; - } - - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py); - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] += Up * B(px, qx); - gradX[qx][1] += Up * Bx(px, qx); - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar wx = gradX[qx][0]; - const CeedScalar wDx = gradX[qx][1]; - grad[qy][qx][0] += wDx * wy; - grad[qy][qx][1] += wx * wDy; - } - } - } - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve_x(qx, qy) = grad[qy][qx][0]; - Ve_y(qx, qy) = grad[qy][qx][1]; - } - } - } - - inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue_x @dim(Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D)) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py) = 0.0; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar gradX[P1D][2]; - for (int px = 0; px < P1D; ++px) { - gradX[px][0] = 0; - 
gradX[px][1] = 0; - } - - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Ux = Ue_x(qx, qy); - const CeedScalar Uy = Ue_y(qx, qy); - for (int px = 0; px < P1D; ++px) { - const CeedScalar wx = B(px, qx); - const CeedScalar wDx = Bx(px, qx); - gradX[px][0] += Ux * wDx; - gradX[px][1] += Uy * wx; - } - } - - for (int py = 0; py < P1D; ++py) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int px = 0; px < P1D; ++px) { - Ve(px, py) += ((gradX[px][0] * wy) + (gradX[px][1] * wDy)); - } - } - } - } - - inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D)) { - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = qWeights1D[qy]; - for (int qx = 0; qx < Q1D; ++qx) { - We(qx, qy) = qWeights1D[qx] * wy; - } - } - } - -); - -const char *occa_tensor_basis_2d_cpu_kernel_source = STRINGIFY_SOURCE( - - @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, component, element), &Ve(0, 0, element, component)); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, element, component), &Ve(0, 0, component, element)); - } - } - } - } - - @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) 
{ - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = V; - - TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, 0, component, element), &Ve(0, 0, element, component, 0), &Ve(0, 0, element, component, 1)); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = U; - CeedScalar *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(gradElementTranspose) - (B, Bx, &Ue(0, 0, element, component, 0), &Ue(0, 0, element, component, 1), &Ve(0, 0, component, element)); - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, elementCount)) { - @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { - TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, element)); - } - } - -); diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp deleted file mode 100644 index 942470b85f..0000000000 --- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../kernel-defines.hpp" - -const char *occa_tensor_basis_3d_cpu_function_source = STRINGIFY_SOURCE( - - @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_3d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") - - inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D, P1D), - CeedScalar *Ve @dim(Q1D, Q1D, Q1D)) { - for (int qz = 0; qz < Q1D; ++qz) { - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy, qz) = 0; - } - } - } - - for (int pz = 0; pz < P1D; ++pz) { - CeedScalar V_xy[Q1D][Q1D]; - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - V_xy[qy][qx] = 0; - } - } - - for (int py = 0; py < P1D; ++py) { - CeedScalar V_x[Q1D]; - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] = 0; - } - - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py, pz); - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] += B(px, qx) * Up; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = B(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - V_xy[qy][qx] += wy * V_x[qx]; - } - } - } - - for (int qz = 0; qz < Q1D; ++qz) { - const CeedScalar wz = B(pz, qz); - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy, qz) += wz * V_xy[qy][qx]; - } - } - } - } - } - - inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D, P1D)) { - for (int pz = 0; pz < P1D; ++pz) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) = 0; - } - } - } - - for (int qz = 0; qz < Q1D; ++qz) { - CeedScalar V_xy[P1D][P1D]; - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - V_xy[py][px] = 0; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar V_x[P1D]; - for 
(int px = 0; px < P1D; ++px) { - V_x[px] = 0; - } - - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Uq = Ue(qx, qy, qz); - for (int px = 0; px < P1D; ++px) { - V_x[px] += B(px, qx) * Uq; - } - } - - for (int py = 0; py < P1D; ++py) { - const CeedScalar wy = B(py, qy); - for (int px = 0; px < P1D; ++px) { - V_xy[py][px] += wy * V_x[px]; - } - } - } - - for (int pz = 0; pz < P1D; ++pz) { - const CeedScalar wz = B(pz, qz); - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) += wz * V_xy[py][px]; - } - } - } - } - } - - inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue @dim(P1D, P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D, Q1D), - CeedScalar *Ve_y @dim(Q1D, Q1D, Q1D), CeedScalar *Ve_z @dim(Q1D, Q1D, Q1D)) { - for (int qz = 0; qz < Q1D; ++qz) { - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve_x(qx, qy, qz) = 0; - Ve_y(qx, qy, qz) = 0; - Ve_z(qx, qy, qz) = 0; - } - } - } - - for (int pz = 0; pz < P1D; ++pz) { - CeedScalar gradXY[Q1D][Q1D][3]; - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - gradXY[qy][qx][0] = 0; - gradXY[qy][qx][1] = 0; - gradXY[qy][qx][2] = 0; - } - } - - for (int py = 0; py < P1D; ++py) { - CeedScalar gradX[Q1D][2]; - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] = 0; - gradX[qx][1] = 0; - } - - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py, pz); - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] += Up * B(px, qx); - gradX[qx][1] += Up * Bx(px, qx); - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar wx = gradX[qx][0]; - const CeedScalar wDx = gradX[qx][1]; - gradXY[qy][qx][0] += wDx * wy; - gradXY[qy][qx][1] += wx * wDy; - gradXY[qy][qx][2] += wx * wy; - } - } - } - - for (int qz = 0; qz < Q1D; ++qz) { - const 
CeedScalar wz = B(pz, qz); - const CeedScalar wDz = Bx(pz, qz); - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve_x(qx, qy, qz) += gradXY[qy][qx][0] * wz; - Ve_y(qx, qy, qz) += gradXY[qy][qx][1] * wz; - Ve_z(qx, qy, qz) += gradXY[qy][qx][2] * wDz; - } - } - } - } - } - - inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue_x @dim(Q1D, Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D, Q1D), - const CeedScalar *Ue_z @dim(Q1D, Q1D, Q1D), CeedScalar *Ve @dim(P1D, P1D, P1D)) { - for (int pz = 0; pz < P1D; ++pz) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) = 0; - } - } - } - - for (int qz = 0; qz < Q1D; ++qz) { - CeedScalar gradXY[P1D][P1D][3]; - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - gradXY[py][px][0] = 0; - gradXY[py][px][1] = 0; - gradXY[py][px][2] = 0; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar gradX[P1D][3]; - for (int px = 0; px < P1D; ++px) { - gradX[px][0] = 0; - gradX[px][1] = 0; - gradX[px][2] = 0; - } - - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Ux = Ue_x(qx, qy, qz); - const CeedScalar Uy = Ue_y(qx, qy, qz); - const CeedScalar Uz = Ue_z(qx, qy, qz); - for (int px = 0; px < P1D; ++px) { - const CeedScalar wx = B(px, qx); - const CeedScalar wDx = Bx(px, qx); - gradX[px][0] += Ux * wDx; - gradX[px][1] += Uy * wx; - gradX[px][2] += Uz * wx; - } - } - - for (int py = 0; py < P1D; ++py) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int px = 0; px < P1D; ++px) { - gradXY[py][px][0] += gradX[px][0] * wy; - gradXY[py][px][1] += gradX[px][1] * wDy; - gradXY[py][px][2] += gradX[px][2] * wy; - } - } - } - - for (int pz = 0; pz < P1D; ++pz) { - const CeedScalar wz = B(pz, qz); - const CeedScalar wDz = Bx(pz, qz); - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) += 
((gradXY[py][px][0] * wz) + (gradXY[py][px][1] * wz) + (gradXY[py][px][2] * wDz)); - } - } - } - } - } - - inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D, Q1D)) { - for (int qz = 0; qz < Q1D; ++qz) { - const CeedScalar wz = qWeights1D[qz]; - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = qWeights1D[qy]; - for (int qx = 0; qx < Q1D; ++qx) { - We(qx, qy, qz) = qWeights1D[qx] * wy * wz; - } - } - } - } - -); - -const char *occa_tensor_basis_3d_cpu_kernel_source = STRINGIFY_SOURCE( - - @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component)); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, 0, element, component), &Ve(0, 0, 0, component, element)); - } - } - } - } - - @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = V; - - TENSOR_FUNCTION(gradElement) - (B, Bx, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component, 0), 
&Ve(0, 0, 0, element, component, 1), - &Ve(0, 0, 0, element, component, 2)); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = U; - CeedScalar *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(gradElementTranspose) - (B, Bx, &Ue(0, 0, 0, element, component, 0), &Ue(0, 0, 0, element, component, 1), &Ue(0, 0, 0, element, component, 2), - &Ve(0, 0, 0, component, element)); - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, Q1D, elementCount)) { - @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { - TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, 0, element)); - } - } - -); diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp deleted file mode 100644 index 34377f29b9..0000000000 --- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../kernel-defines.hpp" - -const char *occa_tensor_basis_1d_gpu_source = STRINGIFY_SOURCE( - - typedef CeedScalar * dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount); - typedef const CeedScalar *const_dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount); - - typedef CeedScalar * quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT); - typedef const CeedScalar *const_quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT); - - typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D); - typedef CeedScalar * elementWeightArray @dim(Q1D, elementCount); - - //---[ Utility Methods ]---------------- - inline void readDofs(const int element, const int localElement, const int component, const int p, const_dofArray U, - sharedBufferArray sharedBuffer) { - // Zero out extra entries - sharedBuffer(p, localElement) = ((p < P1D) ? 
U(p, component, element) : 0.0); - } - - inline void writeDofs(const int element, const int component, const int p, const CeedScalar Vp, dofArray V) { - if (p < P1D) { - V(p, component, element) = Vp; - } - } - - inline void readQuads(const int elementCount, const int element, const int localElement, const int component, const int q, const_quadArray U, - sharedBufferArray sharedBuffer) { sharedBuffer(q, localElement) = U(q, element, component); } - - inline void writeQuads(const int elementCount, const int element, const int component, const int q, const CeedScalar Vq, quadArray V) { - V(q, element, component) = Vq; - } - - inline void contractX(const int q, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) { - V = 0.0; - for (int p = 0; p < P1D; ++p) { - V += B(p, q) * sharedBuffer(p, localElement); - } - } - - inline void contractTransposeX(const int p, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) { - V = 0.0; - for (int q = 0; q < Q1D; ++q) { - V += B(p, q) * sharedBuffer(q, localElement); - } - } - - //---[ Kernels ]------------------------ - @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - for (int q = 0; q < Q1D; ++q; @inner) { - const int element = elementOffset + localElement; - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r; - if (!TRANSPOSE) { - readDofs(element, localElement, component, q, U, sharedBuffer); - contractX(q, localElement, sharedBuffer, B, r); - writeQuads(elementCount, element, component, q, r, V); - } else { - readQuads(elementCount, element, localElement, component, q, U, 
sharedBuffer); - contractTransposeX(q, localElement, sharedBuffer, B, r); - writeDofs(element, component, q, r, V); - } - } - } - } - } - } - } - - @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - for (int q = 0; q < Q1D; ++q; @inner) { - const int element = elementOffset + localElement; - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r; - if (!TRANSPOSE) { - readDofs(element, localElement, component, q, U, sharedBuffer); - contractX(q, localElement, sharedBuffer, Bx, r); - writeQuads(elementCount, element, component, q, r, V); - } else { - readQuads(elementCount, element, localElement, component, q, U, sharedBuffer); - contractTransposeX(q, localElement, sharedBuffer, Bx, r); - writeDofs(element, component, q, r, V); - } - } - } - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) { - for (int q = 0; q < Q1D; ++q; @inner) { - W(q, element) = qWeights1D[q]; - } - } - } - } - -); diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp deleted file mode 100644 index 4d99490306..0000000000 --- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../kernel-defines.hpp" - -const char *occa_tensor_basis_2d_gpu_source = STRINGIFY_SOURCE( - - typedef CeedScalar * dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); - typedef const CeedScalar *const_dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); - - typedef CeedScalar * quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2); - typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2); - - typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D); - typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, elementCount); - - //---[ Utility Methods ]---------------- - inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) { - // Zero out extra entries - *Up = ((px < P1D) && (py < P1D) ? 
U(px, py, component, element) : 0.0); - } - - inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar Vp, dofArray V) { - if ((px < P1D) && (py < P1D)) { - V(px, py, component, element) = Vp; - } - } - - inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, - const_quadArray U, CeedScalar *Uq) { *Uq = U(qx, qy, element, component, dim); } - - inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, - const CeedScalar Vq, quadArray V) { V(qx, qy, element, component, dim) = Vq; } - - inline void contractX(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U, - CeedScalar *V) { - sharedBuffer(qx, qy, localElement) = U; - *V = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - *V += B(p, qx) * sharedBuffer(p, qy, localElement); - } - @barrier(); - } - - inline void contractY(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U, - CeedScalar *V) { - sharedBuffer(qx, qy, localElement) = U; - *V = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - *V += B(p, qy) * sharedBuffer(qx, p, localElement); - } - @barrier(); - } - - inline void contractTransposeX(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, - const CeedScalar U, CeedScalar *V) { - sharedBuffer(px, py, localElement) = U; - *V = 0.0; - @barrier(); - for (int q = 0; q < Q1D; ++q) { - *V += B(px, q) * sharedBuffer(q, py, localElement); - } - @barrier(); - } - - inline void contractTransposeY(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, - const CeedScalar U, CeedScalar *V) { - sharedBuffer(px, py, localElement) = U; - *V = 0.0; - @barrier(); - for (int q = 0; q < Q1D; ++q) { - *V += B(py, 
q) * sharedBuffer(px, q, localElement); - } - @barrier(); - } - - //---[ Kernels ]------------------------ - @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - const int element = elementOffset + localElement; - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r1, r2; - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, &r1); - contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2); - contractY(qx, qy, localElement, sharedBuffer, B, r2, &r1); - writeQuads(elementCount, element, component, qx, qy, 0, r1, V); - } else { - readQuads(elementCount, element, component, qx, qy, 0, U, &r1); - contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2); - contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1); - writeDofs(element, component, qx, qy, r1, V); - } - } - } - } - } - } - } - } - - @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - const int element = elementOffset + localElement; - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r1, r2, r3; - if (!TRANSPOSE) { - 
readDofs(element, component, qx, qy, U, &r1); - contractX(qx, qy, localElement, sharedBuffer, Bx, r1, &r2); - contractY(qx, qy, localElement, sharedBuffer, B, r2, &r3); - writeQuads(elementCount, element, component, qx, qy, 0, r3, V); - contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2); - contractY(qx, qy, localElement, sharedBuffer, Bx, r2, &r3); - writeQuads(elementCount, element, component, qx, qy, 1, r3, V); - } else { - readQuads(elementCount, element, component, qx, qy, 0, U, &r1); - contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2); - contractTransposeX(qx, qy, localElement, sharedBuffer, Bx, r2, &r3); - readQuads(elementCount, element, component, qx, qy, 1, U, &r1); - contractTransposeY(qx, qy, localElement, sharedBuffer, Bx, r1, &r2); - contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1); - writeDofs(element, component, qx, qy, r1 + r3, V); - } - } - } - } - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - W(qx, qy, element) = qWeights1D[qx] * qWeights1D[qy]; - } - } - } - } - } - -); diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp deleted file mode 100644 index 3b59827a8e..0000000000 --- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../kernel-defines.hpp" - -const char *occa_tensor_basis_3d_gpu_source = STRINGIFY_SOURCE( - - typedef CeedScalar * dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); - typedef const CeedScalar *const_dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); - - typedef CeedScalar * quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3); - typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3); - - typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, BASIS_COMPONENT_COUNT); typedef const CeedScalar *quadToDof @dim(P1D, Q1D); - typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, Q1D, elementCount); - - //---[ Utility Methods ]---------------- - inline void add(const CeedScalar *U, CeedScalar *V) { - for (int q = 0; q < Q1D; q++) { - V[q] += U[q]; - } - } - - inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) { - // Zero out extra entries - for (int pz = 0; pz < P1D; ++pz) { - Up[pz] = ((px < P1D) && (py < P1D) ? 
U(px, py, pz, component, element) : 0.0); - } - for (int q = P1D; q < Q1D; ++q) { - Up[q] = 0.0; - } - } - - inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar *Vp, dofArray V) { - if ((px < P1D) && (py < P1D)) { - for (int pz = 0; pz < P1D; ++pz) { - V(px, py, pz, component, element) = Vp[pz]; - } - } - } - - inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, - const_quadArray U, CeedScalar *Uq) { - for (int qz = 0; qz < Q1D; ++qz) { - Uq[qz] = U(qx, qy, qz, element, component, dim); - } - } - - inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, - const CeedScalar *Vq, quadArray V) { - for (int qz = 0; qz < Q1D; ++qz) { - V(qx, qy, qz, element, component, dim) = Vq[qz]; - } - } - - inline void contractX(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(qx, qy, component) = Uq[pz]; - Vp[pz] = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - Vp[pz] += B(p, qx) * sharedBuffer(p, qy, component); - } - @barrier(); - } - } - - inline void contractY(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(qx, qy, component) = Uq[pz]; - Vp[pz] = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - Vp[pz] += B(p, qy) * sharedBuffer(qx, p, component); - } - @barrier(); - } - } - - inline void contractZ(const int qx, const int qy, quadToDof B, const CeedScalar *Up, CeedScalar *Vq) { - for (int qz = 0; qz < Q1D; ++qz) { - Vq[qz] = 0.0; - for (int p = 0; p < P1D; ++p) { - Vq[qz] += B(p, qz) * Up[p]; - } - } - } - - inline void contractTransposeX(const int px, const int py, const int component, 
sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(px, py, component) = Up[pz]; - Vp[pz] = 0.0; - @barrier(); - if (px < P1D) { - for (int qx = 0; qx < Q1D; ++qx) { - Vp[pz] += B(px, qx) * sharedBuffer(qx, py, component); - } - } - @barrier(); - } - } - - inline void contractTransposeY(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(px, py, component) = Up[pz]; - Vp[pz] = 0.0; - @barrier(); - if (py < P1D) { - for (int qy = 0; qy < Q1D; ++qy) { - Vp[pz] += B(py, qy) * sharedBuffer(px, qy, component); - } - } - @barrier(); - } - } - - inline void contractTransposeZ(const int px, const int py, quadToDof B, const CeedScalar *Uq, CeedScalar *Vq) { - for (int pz = 0; pz < P1D; ++pz) { - Vq[pz] = 0.0; - for (int qz = 0; qz < Q1D; ++qz) { - Vq[pz] += B(pz, qz) * Uq[qz]; - } - } - } - - //---[ Kernels ]------------------------ - @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT]; - - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - if (element < elementCount) { - CeedScalar r1[MAX_PQ], r2[MAX_PQ]; - for (int q = 0; q < Q1D; ++q) { - r1[q] = 0.0; - r2[q] = 0.0; - } - - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, r1); - contractX(qx, qy, component, sharedBuffer, B, r1, r2); - contractY(qx, qy, component, sharedBuffer, B, r2, r1); - contractZ(qx, qy, B, r1, r2); - writeQuads(elementCount, element, component, qx, qy, 0, r2, V); - } else { - readQuads(elementCount, element, component, qx, qy, 0, U, r1); - contractTransposeZ(qx, 
qy, B, r1, r2); - contractTransposeY(qx, qy, component, sharedBuffer, B, r2, r1); - contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r2); - writeDofs(element, component, qx, qy, r2, V); - } - } - } - } - } - } - } - - @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT]; - - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - if (element < elementCount) { - CeedScalar r1[MAX_PQ], r2[MAX_PQ], r3[MAX_PQ]; - - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, r1); - // Dx - contractX(qx, qy, component, sharedBuffer, Bx, r1, r2); - contractY(qx, qy, component, sharedBuffer, B, r2, r3); - contractZ(qx, qy, B, r3, r2); - writeQuads(elementCount, element, component, qx, qy, 0, r2, V); - // Dy - contractX(qx, qy, component, sharedBuffer, B, r1, r2); - contractY(qx, qy, component, sharedBuffer, Bx, r2, r3); - contractZ(qx, qy, B, r3, r2); - writeQuads(elementCount, element, component, qx, qy, 1, r2, V); - // Dz - contractX(qx, qy, component, sharedBuffer, B, r1, r2); - contractY(qx, qy, component, sharedBuffer, B, r2, r3); - contractZ(qx, qy, Bx, r3, r2); - writeQuads(elementCount, element, component, qx, qy, 2, r2, V); - } else { - // Dx - readQuads(elementCount, element, component, qx, qy, 0, U, r1); - contractTransposeZ(qx, qy, B, r1, r3); - contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1); - contractTransposeX(qx, qy, component, sharedBuffer, Bx, r1, r2); - // Dy - readQuads(elementCount, element, component, qx, qy, 1, U, r1); - contractTransposeZ(qx, qy, B, r1, r3); - contractTransposeY(qx, qy, component, sharedBuffer, Bx, r3, r1); - contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3); - add(r3, r2); - // 
Dz - readQuads(elementCount, element, component, qx, qy, 2, U, r1); - contractTransposeZ(qx, qy, Bx, r1, r3); - contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1); - contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3); - add(r3, r2); - writeDofs(element, component, qx, qy, r2, V); - } - } - } - } - } - } - } - - @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int qz = 0; qz < Q1D; ++qz; @inner) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx) { - if (element < elementCount) { - W(qx, qy, qz, element) = qWeights1D[qx] * qWeights1D[qy] * qWeights1D[qz]; - } - } - } - } - } - } - -); diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c index 6b0125f2fa..e8980c3ba9 100644 --- a/backends/opt/ceed-opt-blocked.c +++ b/backends/opt/ceed-opt-blocked.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -37,6 +37,7 @@ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt)); diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c index 92f1e7ad07..5f072d1e25 100644 --- a/backends/opt/ceed-opt-operator.c +++ b/backends/opt/ceed-opt-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,9 +16,9 @@ //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size, CeedElemRestriction *block_rstr, - CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, - CeedInt Q) { +static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, bool *apply_add_basis, + const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, + CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { Ceed ceed; CeedSize e_size, q_size; CeedInt num_comp, size, P; @@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - if (ceed_parent) ceed = ceed_parent; + CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); } if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); @@ -101,10 +102,14 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides, &block_rstr[i + start_e])); } break; + // LCOV_EXCL_START case CEED_RESTRICTION_POINTS: // Empty case - won't occur break; + // LCOV_EXCL_STOP } + CeedCallBackend(CeedDestroy(&ceed_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e])); } @@ -124,6 +129,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator 
op, bool i CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisDestroy(&basis)); e_size = (CeedSize)P * num_comp * block_size; CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); q_size = (CeedSize)Q * size * block_size; @@ -134,11 +140,64 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i q_size = (CeedSize)Q * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; } // Initialize E-vec arrays if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0)); } + // Drop duplicate restrictions + if (is_input) { + for (CeedInt i = 0; i < num_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e])); + skip_rstr[j] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } else { + for (CeedInt i = num_fields - 1; i >= 0; i--) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + 
CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i - 1; j >= 0; j--) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e])); + skip_rstr[j] = true; + apply_add_basis[i] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -160,6 +219,7 @@ static int CeedOperatorSetup_Opt(CeedOperator op) { CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetData(ceed, &ceed_impl)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); @@ -172,6 +232,9 @@ static int CeedOperatorSetup_Opt(CeedOperator op) { CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr)); CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); @@ -183,11 +246,11 @@ static int 
CeedOperatorSetup_Opt(CeedOperator op) { // Set up infield and outfield pointer arrays // Infields - CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, - num_input_fields, Q)); + CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, impl->skip_rstr_in, NULL, block_size, impl->block_rstr, impl->e_vecs_full, + impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, - num_input_fields, num_output_fields, Q)); + CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, impl->skip_rstr_out, impl->apply_add_basis_out, block_size, impl->block_rstr, + impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { @@ -206,6 +269,7 @@ static int CeedOperatorSetup_Opt(CeedOperator op) { } CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -216,22 +280,23 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun CeedVector in_vec, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_input_fields; i++) { - uint64_t state; CeedEvalMode eval_mode; - CeedVector vec; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + uint64_t state; + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec != CEED_VECTOR_ACTIVE) { // Restrict CeedCallBackend(CeedVectorGetState(vec, &state)); - if (state != impl->input_states[i]) { + if (state != impl->input_states[i] && impl->block_rstr[i] && !impl->skip_rstr_in[i]) { 
CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); - impl->input_states[i] = state; } + impl->input_states[i] = state; // Get evec CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data[i])); } else { @@ -242,6 +307,7 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_in[i], (const CeedScalar **)&e_data[i])); } } + CeedCallBackend(CeedVectorDestroy(&vec)); } } return CEED_ERROR_SUCCESS; @@ -254,31 +320,33 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction CeedInt num_input_fields, CeedInt block_size, CeedVector in_vec, bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_input_fields; i++) { - bool is_active_input = false; + bool is_active; CeedInt elem_size, size, num_comp; CeedEvalMode eval_mode; CeedVector vec; CeedElemRestriction elem_rstr; CeedBasis basis; - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Skip active input - is_active_input = vec == CEED_VECTOR_ACTIVE; - if (skip_active && is_active_input) continue; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (skip_active && is_active) continue; // Get elem_size, eval_mode, size CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Restrict block active input - if (is_active_input) { + if (is_active && impl->block_rstr[i]) { 
CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i], e / block_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request)); } // Basis action switch (eval_mode) { case CEED_EVAL_NONE: - if (!is_active_input) { + if (!is_active) { CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * Q * size])); } break; @@ -287,11 +355,12 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (!is_active_input) { + if (!is_active) { CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp])); } CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; case CEED_EVAL_WEIGHT: break; // No action @@ -304,16 +373,15 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction // Output Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, - CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) { + CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis, + bool *skip_rstr, CeedOperator op, CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_output_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; - CeedBasis basis; + bool is_active; + CeedEvalMode eval_mode; + CeedVector vec; + CeedBasis 
basis; - // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + // Get eval_mode CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { @@ -324,7 +392,12 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + if (apply_add_basis[i]) { + CeedCallBackend(CeedBasisApplyAdd(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + } else { + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -333,12 +406,15 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio } } // Restrict output block + if (skip_rstr[i]) continue; // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = out_vec; // Restrict - CeedCallBackend( - CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request)); + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, + request)); + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } return CEED_ERROR_SUCCESS; } @@ -357,6 +433,7 @@ static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, CeedQF if (eval_mode != CEED_EVAL_WEIGHT && vec != CEED_VECTOR_ACTIVE) { 
CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data[i])); } + CeedCallBackend(CeedVectorDestroy(&vec)); } return CEED_ERROR_SUCCESS; } @@ -375,20 +452,17 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Opt *impl; + // Setup + CeedCallBackend(CeedOperatorSetup_Opt(op)); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetData(ceed, &ceed_impl)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); const CeedInt block_size = ceed_impl->block_size; const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); - // Setup - CeedCallBackend(CeedOperatorSetup_Opt(op)); - // Restriction only operator if (impl->is_identity_rstr_op) { for (CeedInt b = 0; b < num_blocks; b++) { @@ -398,6 +472,11 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect return CEED_ERROR_SUCCESS; } + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, in_vec, e_data, impl, request)); @@ -416,8 +495,8 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, 
CeedVector in_vec, CeedVect // Loop through elements for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) { // Input basis apply - CeedCallBackend( - CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl, request)); + CeedCallBackend(CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl, + request)); // Q function if (!impl->is_identity_qf) { @@ -425,12 +504,13 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect } // Output basis apply and restriction - CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op, - out_vec, impl, request)); + CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, + impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec, impl, request)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -441,8 +521,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b CeedRequest *request) { Ceed ceed; Ceed_Opt *ceed_impl; - CeedSize q_size; - CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + CeedInt qf_size_in, qf_size_out, Q, num_input_fields, num_output_fields, num_elem; CeedScalar *l_vec_array, *e_data[2 * CEED_FIELD_MAX] = {0}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -452,16 +531,17 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetData(ceed, &ceed_impl)); CeedCallBackend(CeedOperatorGetData(op, &impl)); + qf_size_in = impl->qf_size_in; + 
qf_size_out = impl->qf_size_out; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - const CeedInt block_size = ceed_impl->block_size; - const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); - CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + const CeedInt block_size = ceed_impl->block_size; + const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); CeedVector l_vec = impl->qf_l_vec; - CeedVector *active_in = impl->qf_active_in; CeedElemRestriction block_rstr = impl->qf_block_rstr; // Setup @@ -474,55 +554,45 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, NULL, e_data, impl, request)); // Count number of active input fields - if (!num_active_in) { + if (qf_size_in == 0) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedScalar *q_vec_array; - CeedVector vec; + CeedInt field_size; + CeedVector vec; - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); - CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); - for (CeedInt field = 0; field < 
size; field++) { - q_size = (CeedSize)Q * block_size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend( - CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size])); - } - num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); + qf_size_in += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_in = qf_size_in; } // Count number of active output fields - if (!num_active_out) { + if (qf_size_out == 0) { for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt field_size; CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - num_active_out += size; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_size_out += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_out = num_active_out; + CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_out = qf_size_out; } - // Check sizes - CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); - // Setup l_vec if (!l_vec) { - const CeedSize l_size = (CeedSize)block_size * Q * num_active_in * num_active_out; + const CeedSize l_size = (CeedSize)block_size * Q * qf_size_in * qf_size_out; CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec)); 
CeedCallBackend(CeedVectorSetValue(l_vec, 0.0)); @@ -531,21 +601,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b // Output blocked restriction if (!block_rstr) { - CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; - CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, &block_rstr)); + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, qf_size_in * qf_size_out, + qf_size_in * qf_size_out * num_elem * Q, strides, &block_rstr)); impl->qf_block_rstr = block_rstr; } // Build objects if needed if (build_objects) { - const CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; + CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; // Create output restriction - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, - strides, rstr)); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, + (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr)); // Create assembled vector CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); } @@ -556,55 +626,82 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b CeedCallBackend(CeedVectorGetArray(l_vec, CEED_MEM_HOST, &l_vec_array)); // Input basis apply - CeedCallBackend( - CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl, request)); + CeedCallBackend(CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, 
num_input_fields, block_size, NULL, true, e_data, impl, + request)); // Assemble QFunction - for (CeedInt in = 0; in < num_active_in; in++) { - // Set Inputs - CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); - if (num_active_in > 1) { - CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); - } - if (!impl->is_identity_qf) { - // Set Outputs - for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; - - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); - // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); - l_vec_array += size * Q * block_size; // Advance the pointer by the size of the output + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active; + CeedInt field_size; + CeedVector vec; + + // Check if active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active) continue; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); + for (CeedInt field = 0; field < field_size; field++) { + // Set current portion of input to 1.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 1.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); + } + + if (!impl->is_identity_qf) { + // Set Outputs + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + + // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedInt field_size; + + 
CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size)); + l_vec_array += field_size * Q * block_size; // Advance the pointer by the size of the output + } + CeedCallBackend(CeedVectorDestroy(&vec)); } + // Apply QFunction + CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); + } else { + CeedInt field_size; + const CeedScalar *array; + + // Copy Identity Outputs + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size)); + CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < field_size * Q * block_size; j++) l_vec_array[j] = array[j]; + CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array)); + l_vec_array += field_size * Q * block_size; + } + // Reset input to 0.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); } - // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); - } else { - const CeedScalar *q_vec_array; - - // Copy Identity Outputs - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size)); - CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array)); - for (CeedInt i = 0; i < size * Q * block_size; i++) l_vec_array[i] = q_vec_array[i]; - CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array)); - l_vec_array += size * Q * block_size; } } - // Assemble QFunction + // Un-set output Qvecs to prevent accidental overwrite of Assembled if (!impl->is_identity_qf) { for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; - // Get output vector - 
CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL)); } + CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -613,18 +710,20 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b CeedCallBackend(CeedElemRestrictionApplyBlock(block_rstr, e / block_size, CEED_TRANSPOSE, l_vec, *assembled, request)); } - // Un-set output Qvecs to prevent accidental overwrite of Assembled + // Reset output Qvecs for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Initialize array if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); if (vec == CEED_VECTOR_ACTIVE) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_out[out], 0.0)); + CeedCallBackend(CeedVectorDestroy(&vec)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -656,6 +755,9 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) { CeedCallBackend(CeedFree(&impl->block_rstr)); CeedCallBackend(CeedFree(&impl->e_vecs_full)); CeedCallBackend(CeedFree(&impl->input_states)); + CeedCallBackend(CeedFree(&impl->skip_rstr_in)); + CeedCallBackend(CeedFree(&impl->skip_rstr_out)); + CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); for (CeedInt i = 0; i < impl->num_inputs; i++) { CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i])); @@ -672,10 +774,6 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) { CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i = 
0; i < impl->num_active_in; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); - } - CeedCallBackend(CeedFree(&impl->qf_active_in)); CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec)); CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr)); @@ -704,6 +802,7 @@ int CeedOperatorCreate_Opt(CeedOperator op) { CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Opt)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c index ac506a4ec6..1e3517b44a 100644 --- a/backends/opt/ceed-opt-serial.c +++ b/backends/opt/ceed-opt-serial.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -37,6 +37,7 @@ static int CeedInit_Opt_Serial(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt)); diff --git a/backends/opt/ceed-opt-tensor.c b/backends/opt/ceed-opt-tensor.c index a8f5335e35..ee41dce029 100644 --- a/backends/opt/ceed-opt-tensor.c +++ b/backends/opt/ceed-opt-tensor.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h index 9e12e612bf..a1b67a58e4 100644 --- a/backends/opt/ceed-opt.h +++ b/backends/opt/ceed-opt.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -21,6 +21,7 @@ typedef struct { typedef struct { bool is_identity_qf, is_identity_rstr_op; + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */ CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ uint64_t *input_states; /* State counter of inputs */ @@ -29,8 +30,7 @@ typedef struct { CeedVector *q_vecs_in; /* Element block input Q-vectors */ CeedVector *q_vecs_out; /* Element block output Q-vectors */ CeedInt num_inputs, num_outputs; - CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in; + CeedInt qf_size_in, qf_size_out; CeedVector qf_l_vec; CeedElemRestriction qf_block_rstr; } CeedOperator_Opt; diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c index b82e8bb278..d8eef6ce98 100644 --- a/backends/ref/ceed-ref-basis.c +++ b/backends/ref/ceed-ref-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,17 +16,15 @@ //------------------------------------------------------------------------------ // Basis Apply //------------------------------------------------------------------------------ -static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { - Ceed ceed; - bool is_tensor_basis; +static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, + CeedVector V) { + bool is_tensor_basis, add = apply_add || (t_mode == CEED_TRANSPOSE); CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; - const CeedInt add = (t_mode == CEED_TRANSPOSE); const CeedScalar *u; CeedScalar *v; CeedTensorContract contract; CeedBasis_Ref *impl; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); @@ -35,14 +33,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); CeedCallBackend(CeedBasisGetTensorContract(basis, &contract)); if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u)); - else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); - + else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); // Clear v if operating in transpose - if (t_mode == CEED_TRANSPOSE) { - const CeedInt v_size = num_elem * num_comp * num_nodes; + if (apply_add) CeedCallBackend(CeedVectorGetArray(V, CEED_MEM_HOST, &v)); + else CeedCallBackend(CeedVectorGetArrayWrite(V, 
CEED_MEM_HOST, &v)); + + if (t_mode == CEED_TRANSPOSE && !apply_add) { + CeedSize len; - for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0; + CeedCallBackend(CeedVectorGetLength(V, &len)); + for (CeedInt i = 0; i < len; i++) v[i] = 0.0; } CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor_basis)); @@ -55,7 +55,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo switch (eval_mode) { // Interpolate to/from quadrature points case CEED_EVAL_INTERP: { - if (impl->has_collo_interp) { + if (impl->is_collocated) { memcpy(v, u, num_elem * num_comp * num_nodes * sizeof(u[0])); } else { CeedInt P = P_1d, Q = Q_1d; @@ -101,8 +101,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // or Grad to quadrature points (Transpose) for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? interp_1d : impl->collo_grad_1d), t_mode, - add && (d > 0), - (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : u + d * num_qpts * num_comp * num_elem), + (t_mode == CEED_TRANSPOSE) && (d > 0), + (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : &u[d * num_qpts * num_comp * num_elem]), (t_mode == CEED_NOTRANSPOSE ? (d == dim - 1 ? interp : tmp[(d + 1) % 2]) : interp))); pre /= P; post *= Q; @@ -116,14 +116,15 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo } pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; for (CeedInt d = 0; d < dim; d++) { - CeedCallBackend(CeedTensorContractApply( - contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode, add && (d == dim - 1), - (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])), - (t_mode == CEED_NOTRANSPOSE ? v + d * num_qpts * num_comp * num_elem : (d == dim - 1 ? v : tmp[(d + 1) % 2])))); + CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? 
impl->collo_grad_1d : interp_1d), t_mode, + (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)), + (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])), + (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem] + : (d == dim - 1 ? v : tmp[(d + 1) % 2])))); pre /= P; post *= Q; } - } else if (impl->has_collo_interp) { // Qpts collocated with nodes + } else if (impl->is_collocated) { // Qpts collocated with nodes const CeedScalar *grad_1d; CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d)); @@ -133,8 +134,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, grad_1d, t_mode, add && (d > 0), - t_mode == CEED_NOTRANSPOSE ? u : u + d * num_comp * num_qpts * num_elem, - t_mode == CEED_TRANSPOSE ? v : v + d * num_comp * num_qpts * num_elem)); + t_mode == CEED_NOTRANSPOSE ? u : &u[d * num_comp * num_qpts * num_elem], + t_mode == CEED_TRANSPOSE ? v : &v[d * num_comp * num_qpts * num_elem])); pre /= P; post *= Q; } @@ -156,8 +157,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply( contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1), - (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : u + p * num_comp * num_qpts * num_elem) : tmp[d % 2]), - (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : v + p * num_comp * num_qpts * num_elem) : tmp[(d + 1) % 2]))); + (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : &u[p * num_comp * num_qpts * num_elem]) : tmp[d % 2]), + (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? 
v : &v[p * num_comp * num_qpts * num_elem]) : tmp[(d + 1) % 2]))); pre /= P; post *= Q; } @@ -169,7 +170,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo CeedInt Q = Q_1d; const CeedScalar *q_weight_1d; - CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + CeedCheck(t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight_1d)); for (CeedInt d = 0; d < dim; d++) { CeedInt pre = CeedIntPow(Q, dim - d - 1), post = CeedIntPow(Q, d); @@ -188,9 +189,9 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // LCOV_EXCL_START case CEED_EVAL_DIV: case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); // LCOV_EXCL_STOP } } else { @@ -230,7 +231,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo case CEED_EVAL_WEIGHT: { const CeedScalar *q_weight; - CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + CeedCheck(t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight)); for (CeedInt i = 0; i < num_qpts; i++) { for (CeedInt e = 0; e < num_elem; e++) v[i * num_elem + e] = q_weight[i]; @@ -238,7 +239,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo } 
break; // LCOV_EXCL_START case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); // LCOV_EXCL_STOP } } @@ -249,6 +250,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { + CeedCallBackend(CeedBasisApplyCore_Ref(basis, false, num_elem, t_mode, eval_mode, U, V)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { + CeedCallBackend(CeedBasisApplyCore_Ref(basis, true, num_elem, t_mode, eval_mode, U, V)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis Destroy Tensor //------------------------------------------------------------------------------ @@ -274,20 +285,9 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedCalloc(1, &impl)); - // Check for collocated interp - if (Q_1d == P_1d) { - bool has_collocated = true; - - for (CeedInt i = 0; i < P_1d; i++) { - has_collocated = has_collocated && (fabs(interp_1d[i + P_1d * i] - 1.0) < 1e-14); - for (CeedInt j = 0; j < P_1d; j++) { - if (j != i) has_collocated = has_collocated && (fabs(interp_1d[j + P_1d * i]) < 1e-14); - } - } - impl->has_collo_interp = has_collocated; - } // Calculate collocated grad - if (Q_1d >= P_1d && !impl->has_collo_interp) { + CeedCallBackend(CeedBasisIsCollocated(basis, &impl->is_collocated)); + if (Q_1d >= P_1d && !impl->is_collocated) { CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &impl->collo_grad_1d)); 
CeedCallBackend(CeedBasisGetCollocatedGrad(basis, impl->collo_grad_1d)); } @@ -295,9 +295,13 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); + CeedCallBackend(CeedTensorContractDestroy(&contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); return CEED_ERROR_SUCCESS; } @@ -314,8 +318,12 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); + CeedCallBackend(CeedTensorContractDestroy(&contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); return CEED_ERROR_SUCCESS; } @@ -332,8 +340,12 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); + CeedCallBackend(CeedTensorContractDestroy(&contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); return CEED_ERROR_SUCCESS; } @@ -350,8 +362,12 @@ int 
CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); + CeedCallBackend(CeedTensorContractDestroy(&contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c index f525460ea6..326ff93e61 100644 --- a/backends/ref/ceed-ref-operator.c +++ b/backends/ref/ceed-ref-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,8 +16,9 @@ //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs, - CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { +static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedInt *e_data_out_indices, + bool *apply_add_basis, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, + CeedInt num_fields, CeedInt Q) { Ceed ceed; CeedSize e_size, q_size; CeedInt num_comp, size, P; @@ -29,7 +30,8 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - if (ceed_parent) ceed = ceed_parent; + CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); } if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); @@ -49,6 +51,7 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i if (eval_mode != CEED_EVAL_WEIGHT) { CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e])); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } switch (eval_mode) { @@ -69,15 +72,70 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); q_size = (CeedSize)Q * size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; case CEED_EVAL_WEIGHT: // Only on input fields 
CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); q_size = (CeedSize)Q; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; } } + // Drop duplicate restrictions + if (is_input) { + for (CeedInt i = 0; i < num_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e])); + skip_rstr[j] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } else { + for (CeedInt i = num_fields - 1; i >= 0; i--) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i - 1; j >= 0; j--) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], 
&e_vecs_full[j + start_e])); + skip_rstr[j] = true; + apply_add_basis[i] = true; + e_data_out_indices[j] = i; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -105,6 +163,10 @@ static int CeedOperatorSetup_Ref(CeedOperator op) { // Allocate CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_data_out_indices)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); @@ -116,10 +178,11 @@ static int CeedOperatorSetup_Ref(CeedOperator op) { // Set up infield and outfield e_vecs and q_vecs // Infields - CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); + CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->skip_rstr_in, NULL, NULL, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, + num_input_fields, Q)); // Outfields - CeedCallBackend( - CeedOperatorSetupFields_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); + CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, false, impl->skip_rstr_out, impl->e_data_out_indices, impl->apply_add_basis_out, + impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { @@ -138,6 +201,7 @@ static int 
CeedOperatorSetup_Ref(CeedOperator op) { } CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -148,14 +212,15 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun CeedVector in_vec, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_input_fields; i++) { - uint64_t state; - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; + bool is_active; + uint64_t state; + CeedEvalMode eval_mode; + CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) { if (skip_active) continue; else vec = in_vec; } @@ -167,14 +232,18 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun // Restrict CeedCallBackend(CeedVectorGetState(vec, &state)); // Skip restriction if input is unchanged - if (state != impl->input_states[i] || vec == in_vec) { + if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) { + CeedElemRestriction elem_rstr; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); - impl->input_states[i] = state; + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } + impl->input_states[i] = state; // Get evec CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i])); } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } return CEED_ERROR_SUCCESS; } @@ -193,14 +262,18 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction // Skip active input if (skip_active) { + bool is_active; CeedVector vec; 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (is_active) continue; } // Get elem_size, eval_mode, size CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action @@ -216,6 +289,7 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][(CeedSize)e * elem_size * num_comp])); CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; case CEED_EVAL_WEIGHT: break; // No action @@ -228,7 +302,7 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction // Output Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, + CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis, CeedOperator op, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) { for (CeedInt i = 0; i < num_output_fields; i++) { CeedInt elem_size, num_comp; @@ -239,6 +313,7 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio // Get elem_size, eval_mode 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { @@ -252,7 +327,12 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * elem_size * num_comp])); - CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + if (apply_add_basis[i]) { + CeedCallBackend(CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + } else { + CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -274,10 +354,13 @@ static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, CeedQF // Skip active inputs if (skip_active) { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (is_active) continue; } // Restore input CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); @@ -301,33 +384,40 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Ref *impl; - CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, 
&Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Setup CeedCallBackend(CeedOperatorSetup_Ref(op)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + // Restriction only operator if (impl->is_identity_rstr_op) { CeedElemRestriction elem_rstr; CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_rstr)); CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_rstr)); CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); return CEED_ERROR_SUCCESS; } + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request)); // Output Evecs - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields])); + for (CeedInt i = num_output_fields - 1; i >= 0; i--) { + if (impl->skip_rstr_out[i]) { + e_data_full[i + num_input_fields] = e_data_full[impl->e_data_out_indices[i] + num_input_fields]; + } else 
{ + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields])); + } } // Loop through elements @@ -337,8 +427,8 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - CeedCallBackend( - CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * Q * size])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, + &e_data_full[i + num_input_fields][(CeedSize)e * Q * size])); } } @@ -351,28 +441,34 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect } // Output basis apply - CeedCallBackend( - CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields, op, e_data_full, impl)); + CeedCallBackend(CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields, + impl->apply_add_basis_out, op, e_data_full, impl)); } // Output restriction for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active; CeedVector vec; CeedElemRestriction elem_rstr; + if (impl->skip_rstr_out[i]) continue; // Restore Evec CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields])); // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Active - if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = out_vec; // Restrict CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); + if 
(!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -381,21 +477,18 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - Ceed ceed, ceed_parent; - CeedSize q_size; - CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; + Ceed ceed_parent; + CeedInt qf_size_in, qf_size_out, Q, num_elem, num_input_fields, num_output_fields; CeedScalar *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL}; - CeedVector *active_in; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Ref *impl; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); CeedCallBackend(CeedOperatorGetData(op, &impl)); - active_in = impl->qf_active_in; - num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + qf_size_in = impl->qf_size_in; + qf_size_out = impl->qf_size_out; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); @@ -406,65 +499,58 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b CeedCallBackend(CeedOperatorSetup_Ref(op)); // Check for restriction only operator - CeedCheck(!impl->is_identity_rstr_op, ceed, CEED_ERROR_BACKEND, "Assembling restriction only 
operators is not supported"); + CeedCheck(!impl->is_identity_rstr_op, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Assembling restriction only operators is not supported"); // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request)); // Count number of active input fields - if (!num_active_in) { + if (qf_size_in == 0) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedScalar *q_vec_array; - CeedVector vec; + CeedInt field_size; + CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); - CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); - for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q; - CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field])); - CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q])); - } - num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); + qf_size_in += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + CeedCheck(qf_size_in > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_in = qf_size_in; } // Count number of active output fields - if (!num_active_out) { + if (qf_size_out == 0) { for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt field_size; CeedVector vec; // Get 
output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - num_active_out += size; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); + qf_size_out += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_out = num_active_out; + CeedCheck(qf_size_out > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_out = qf_size_out; } - // Check sizes - CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); - // Build objects if needed if (build_objects) { - const CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; /* *NOPAD* */ + const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; + CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; /* *NOPAD* */ // Create output restriction - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, rstr)); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, qf_size_in * qf_size_out, + (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr)); // Create assembled vector CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } @@ -478,37 +564,66 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b CeedCallBackend(CeedOperatorInputBasis_Ref(e, Q, qf_input_fields, op_input_fields, num_input_fields, true, e_data_full, impl)); // Assemble QFunction - for (CeedInt in = 0; in < num_active_in; in++) { + + for (CeedInt i = 0; i < num_input_fields; 
i++) { + bool is_active; + CeedInt field_size; + CeedVector vec; + // Set Inputs - CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); - if (num_active_in > 1) { - CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); - } - if (!impl->is_identity_qf) { - // Set Outputs - for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; - - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); - // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); - assembled_array += size * Q; // Advance the pointer by the size of the output + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active) continue; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); + for (CeedInt field = 0; field < field_size; field++) { + // Set current portion of input to 1.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < Q; j++) array[field * Q + j] = 1.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); + } + + if (!impl->is_identity_qf) { + // Set Outputs + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); + // Check if active output + if (vec == CEED_VECTOR_ACTIVE) { + CeedInt field_size; + + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size)); + assembled_array += field_size * Q; // Advance the 
pointer by the size of the output + } + CeedCallBackend(CeedVectorDestroy(&vec)); } + // Apply QFunction + CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out)); + } else { + CeedInt field_size; + const CeedScalar *array; + + // Copy Identity Outputs + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size)); + CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < field_size * Q; j++) assembled_array[j] = array[j]; + CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array)); + assembled_array += field_size * Q; + } + // Reset input to 0.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < Q; j++) array[field * Q + j] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); } - // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out)); - } else { - const CeedScalar *q_vec_array; - - // Copy Identity Outputs - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size)); - CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array)); - for (CeedInt i = 0; i < size * Q; i++) assembled_array[i] = q_vec_array[i]; - CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array)); - assembled_array += size * Q; } } } @@ -524,6 +639,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL)); } + CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -532,6 +648,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b // Restore output CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); + CeedCallBackend(CeedDestroy(&ceed_parent)); + 
CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -552,11 +670,12 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, CeedVe //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs, - CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { +static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, bool *apply_add_basis, + CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, + CeedInt Q) { Ceed ceed; CeedSize e_size, q_size; - CeedInt e_size_padding = 0, max_num_points, num_comp, size, P; + CeedInt max_num_points, num_comp, size, P; CeedQFunctionField *qf_fields; CeedOperatorField *op_fields; @@ -565,7 +684,8 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - if (ceed_parent) ceed = ceed_parent; + CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); } if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); @@ -600,26 +720,11 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { CeedElemRestriction elem_rstr; - CeedSize e_size; - bool is_at_points; CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); - CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points)); 
- if (is_at_points) { - CeedCallBackend(CeedElemRestrictionGetEVectorSize(elem_rstr, &e_size)); - if (e_size_padding == 0) { - CeedInt num_points, num_elem; - - CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, num_elem - 1, &num_points)); - e_size_padding = (max_num_points - num_points) * num_comp; - } - CeedCallBackend(CeedVectorCreate(ceed, e_size + e_size_padding, &e_vecs_full[i + start_e])); - CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0)); - } else { - CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e])); - } + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e])); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0)); } switch (eval_mode) { @@ -636,6 +741,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op q_size = (CeedSize)max_num_points * size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); } + CeedCallBackend(CeedVectorDestroy(&vec)); break; } case CEED_EVAL_INTERP: @@ -650,19 +756,73 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); q_size = (CeedSize)max_num_points * size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; case CEED_EVAL_WEIGHT: // Only on input fields CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); q_size = (CeedSize)max_num_points; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend( - CeedBasisApplyAtPoints(basis, max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, 
CEED_VECTOR_NONE, CEED_VECTOR_NONE, + q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; } // Initialize full arrays for E-vectors and Q-vectors if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorSetValue(q_vecs[i], 0.0)); } + // Drop duplicate restrictions + if (is_input) { + for (CeedInt i = 0; i < num_fields; i++) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i + 1; j < num_fields; j++) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e])); + skip_rstr[j] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } else { + for (CeedInt i = num_fields - 1; i >= 0; i--) { + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i)); + for (CeedInt j = i - 1; j >= 0; j--) { + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j)); + if (vec_i == vec_j && rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j])); + CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], 
&e_vecs_full[j + start_e])); + skip_rstr[j] = true; + apply_add_basis[i] = true; + } + CeedCallBackend(CeedVectorDestroy(&vec_j)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); + } + CeedCallBackend(CeedVectorDestroy(&vec_i)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); + } + } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -690,6 +850,9 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) { // Allocate CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); @@ -701,10 +864,11 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) { // Set up infield and outfield pointer arrays // Infields - CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); + CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->skip_rstr_in, NULL, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, + num_input_fields, Q)); // Outfields - CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, - num_output_fields, Q)); + CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_full, + impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { @@ -713,6 +877,7 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) { } CeedCallBackend(CeedOperatorSetSetupDone(op)); + 
CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -721,10 +886,10 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) { //------------------------------------------------------------------------------ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, CeedInt num_input_fields, CeedVector in_vec, - CeedVector point_coords_elem, bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], - CeedOperator_Ref *impl, CeedRequest *request) { + CeedVector point_coords_elem, bool skip_active, bool skip_passive, + CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_input_fields; i++) { - bool is_active_input = false; + bool is_active; CeedInt elem_size, size, num_comp; CeedRestrictionType rstr_type; CeedEvalMode eval_mode; @@ -732,10 +897,12 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin CeedElemRestriction elem_rstr; CeedBasis basis; - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Skip active input - is_active_input = vec == CEED_VECTOR_ACTIVE; - if (skip_active && is_active_input) continue; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (skip_active && is_active) continue; + if (skip_passive && !is_active) continue; // Get elem_size, eval_mode, size CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); @@ -743,7 +910,8 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Restrict block active input - if (is_active_input) { + // When skipping passive inputs, 
we're doing assembly and should not restrict + if (is_active && !impl->skip_rstr_in[i] && !skip_passive) { if (rstr_type == CEED_RESTRICTION_POINTS) { CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request)); } else { @@ -753,7 +921,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin // Basis action switch (eval_mode) { case CEED_EVAL_NONE: - if (!is_active_input) { + if (!is_active) { CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][num_points_offset * size])); } break; @@ -763,17 +931,19 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (!is_active_input) { + if (!is_active) { CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp])); } - CeedCallBackend( - CeedBasisApplyAtPoints(basis, num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], + impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; case CEED_EVAL_WEIGHT: break; // No action } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } return CEED_ERROR_SUCCESS; } @@ -783,15 +953,22 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin //------------------------------------------------------------------------------ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_output_fields, 
CeedOperatorField *op_output_fields, CeedInt num_input_fields, CeedInt num_output_fields, - CeedOperator op, CeedVector out_vec, CeedVector point_coords_elem, CeedOperator_Ref *impl, - CeedRequest *request) { + bool *apply_add_basis, bool *skip_rstr, CeedOperator op, CeedVector out_vec, + CeedVector point_coords_elem, bool skip_passive, CeedOperator_Ref *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active; CeedRestrictionType rstr_type; CeedEvalMode eval_mode; CeedVector vec; CeedElemRestriction elem_rstr; CeedBasis basis; + // Skip active input + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (skip_passive && !is_active) continue; + // Get elem_size, eval_mode, size CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); @@ -804,8 +981,14 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend( - CeedBasisApplyAtPoints(basis, num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i])); + if (apply_add_basis[i]) { + CeedCallBackend(CeedBasisApplyAddAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], + impl->e_vecs_out[i])); + } else { + CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], + impl->e_vecs_out[i])); + } + CeedCallBackend(CeedBasisDestroy(&basis)); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -814,16 +997,24 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi } } // Restrict output block + // When skipping passive outputs, 
we're doing assembly and should not restrict + if (skip_rstr[i] || skip_passive) { + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + continue; + } + // Get output vector CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; + if (is_active) vec = out_vec; // Restrict if (rstr_type == CEED_RESTRICTION_POINTS) { CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request)); } else { CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request)); } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } return CEED_ERROR_SUCCESS; } @@ -863,10 +1054,11 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec, // Setup points for element CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points)); + if (num_points < 1) continue; // Input basis apply CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec, - impl->point_coords_elem, false, e_data, impl, request)); + impl->point_coords_elem, false, false, e_data, impl, request)); // Q function if (!impl->is_identity_qf) { @@ -875,7 +1067,8 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec, // Output basis apply and restriction CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields, - num_output_fields, op, out_vec, impl->point_coords_elem, impl, request)); + num_output_fields, impl->apply_add_basis_out, 
impl->skip_rstr_out, op, out_vec, + impl->point_coords_elem, false, impl, request)); num_points_offset += num_points; } @@ -886,6 +1079,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec, // Cleanup point coordinates CeedCallBackend(CeedVectorDestroy(&point_coords)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -895,10 +1089,9 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec, static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { Ceed ceed; - CeedSize q_size; - CeedInt num_active_in, num_active_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0; + CeedInt qf_size_in, qf_size_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0; CeedScalar *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL}; - CeedVector *active_in, point_coords = NULL; + CeedVector point_coords = NULL; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; @@ -907,8 +1100,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); - active_in = impl->qf_active_in; - num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + qf_size_in = impl->qf_size_in; + qf_size_out = impl->qf_size_out; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); @@ -928,11 +1121,10 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat 
CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request)); // Count number of active input fields - if (!num_active_in) { + if (qf_size_in == 0) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedScalar *q_vec_array; - CeedInt field_size; - CeedVector vec; + CeedInt field_size; + CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); @@ -944,32 +1136,25 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat bool is_at_points = false; CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points)); + CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points"); } // Get size of active input CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); - CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); - CeedCallBackend(CeedRealloc(num_active_in + field_size, &active_in)); - for (CeedInt field = 0; field < field_size; field++) { - q_size = (CeedSize)max_num_points; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * q_size])); - } - num_active_in += field_size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); + qf_size_in += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + CeedCheck(qf_size_in, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction 
without active inputs and outputs"); + impl->qf_size_in = qf_size_in; } // Count number of active output fields - if (!num_active_out) { + if (qf_size_out == 0) { for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; CeedInt field_size; + CeedVector vec; // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); @@ -981,20 +1166,21 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat bool is_at_points = false; CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points)); + CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points"); } // Get size of active output CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); - num_active_out += field_size; + CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); + qf_size_out += field_size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } - impl->num_active_out = num_active_out; + CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + impl->qf_size_out = qf_size_out; } - // Check sizes - CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); - // Build objects if needed if (build_objects) { CeedInt num_points_total; @@ -1004,9 +1190,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat // Create output restriction (at points) CeedCallBackend(CeedElemRestrictionGetOffsets(rstr_points, CEED_MEM_HOST, &offsets)); - CeedCallBackend(CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points_total, num_active_in * num_active_out, - num_active_in * num_active_out * 
num_points_total, CEED_MEM_HOST, CEED_COPY_VALUES, offsets, - rstr)); + CeedCallBackend(CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points_total, qf_size_in * qf_size_out, + qf_size_in * qf_size_out * num_points_total, CEED_MEM_HOST, CEED_COPY_VALUES, offsets, rstr)); CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr_points, &offsets)); // Create assembled vector @@ -1023,45 +1208,73 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat // Setup points for element CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points)); + if (num_points < 1) continue; // Input basis apply CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, NULL, - impl->point_coords_elem, true, e_data_full, impl, request)); + impl->point_coords_elem, true, false, e_data_full, impl, request)); // Assemble QFunction - for (CeedInt in = 0; in < num_active_in; in++) { - // Set Inputs - CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); - if (num_active_in > 1) { - CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); - } - if (!impl->is_identity_qf) { - // Set Outputs - for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; - CeedInt field_size; - - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); - // Check if active output - if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size)); - assembled_array += field_size * num_points; // Advance the pointer by the size of the output + for (CeedInt i = 0; i < num_input_fields; i++) { 
+ bool is_active; + CeedInt field_size; + CeedVector vec; + + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + // Check if active input + if (!is_active) continue; + // Get size of active input + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); + for (CeedInt field = 0; field < field_size; field++) { + // Set current portion of input to 1.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < num_points; j++) array[field * num_points + j] = 1.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); + } + + if (!impl->is_identity_qf) { + // Set Outputs + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + CeedInt field_size; + + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); + // Check if active output + if (vec == CEED_VECTOR_ACTIVE) { + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size)); + assembled_array += field_size * num_points; // Advance the pointer by the size of the output + } + CeedCallBackend(CeedVectorDestroy(&vec)); } + // Apply QFunction + CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out)); + } else { + const CeedScalar *array; + CeedInt field_size; + + // Copy Identity Outputs + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size)); + CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < field_size * num_points; j++) assembled_array[j] = array[j]; + CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array)); + assembled_array += field_size * num_points; + } + 
// Reset input to 0.0 + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array)); + for (CeedInt j = 0; j < num_points; j++) array[field * num_points + j] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array)); } - // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out)); - } else { - const CeedScalar *q_vec_array; - CeedInt field_size; - - // Copy Identity Outputs - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size)); - CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array)); - for (CeedInt i = 0; i < field_size * num_points; i++) assembled_array[i] = q_vec_array[i]; - CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array)); - assembled_array += field_size * num_points; } } num_points_offset += num_points; @@ -1078,6 +1291,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL)); } + CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -1088,8 +1302,10 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorDestroy(&point_coords)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -1109,8 +1325,414 @@ static int CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref(CeedOperator op } //------------------------------------------------------------------------------ -// Assemble Operator +// Assemble Operator Diagonal AtPoints +//------------------------------------------------------------------------------ +static int 
CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedInt num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; + Ceed ceed; + CeedVector point_coords = NULL, in_vec, out_vec; + CeedElemRestriction rstr_points = NULL; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Ref *impl; + + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Setup + CeedCallBackend(CeedOperatorSetupAtPoints_Ref(op)); + + // Ceed + { + Ceed ceed_parent; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); + } + + // Point coordinates + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords)); + + // Input and output vectors + { + CeedSize input_size, output_size; + + CeedCallBackend(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); + CeedCallBackend(CeedVectorCreate(ceed, input_size, &in_vec)); + CeedCallBackend(CeedVectorCreate(ceed, output_size, &out_vec)); + CeedCallBackend(CeedVectorSetValue(out_vec, 0.0)); + } + + // Clear input Evecs + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active || 
impl->skip_rstr_in[i]) continue; + CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0)); + } + + // Input Evecs and Restriction + CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); + + // Loop through elements + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points, e_vec_size = 0; + + // Setup points for element + CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request)); + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points)); + if (num_points < 1) continue; + + // Input basis apply for non-active bases + CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec, + impl->point_coords_elem, true, false, e_data, impl, request)); + + // Loop over points on element + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active_at_points = true, is_active; + CeedInt elem_size_active = 1; + CeedRestrictionType rstr_type; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // -- Skip non-active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active || impl->skip_rstr_in[i]) continue; + + // -- Get active restriction type + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS; + if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active)); + else elem_size_active = num_points; + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + + e_vec_size = 
elem_size_active * num_comp_active; + for (CeedInt s = 0; s < e_vec_size; s++) { + // -- Update unit vector + { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array)); + array[s] = 1.0; + if (s > 0) array[s - 1] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array)); + } + // Input basis apply for active bases + CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, + in_vec, impl->point_coords_elem, false, true, e_data, impl, request)); + + // -- Q function + if (!impl->is_identity_qf) { + CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out)); + } + + // -- Output basis apply and restriction + CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields, + num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec, + impl->point_coords_elem, true, impl, request)); + + // -- Grab diagonal value + for (CeedInt j = 0; j < num_output_fields; j++) { + bool is_active; + CeedInt elem_size = 0; + CeedRestrictionType rstr_type; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // ---- Skip non-active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active || impl->skip_rstr_out[j]) continue; + + // ---- Check if elem size matches + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + continue; + } + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size)); + } 
else { + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + } + { + CeedInt num_comp = 0; + + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + if (e_vec_size != num_comp * elem_size) { + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + continue; + } + } + // ---- Update output vector + { + CeedScalar *array, current_value = 0.0; + + CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[j], CEED_MEM_HOST, &array)); + current_value = array[s]; + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &array)); + CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[j], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[j], CEED_MEM_HOST, &array)); + array[s] = current_value; + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &array)); + } + // ---- Restrict output block + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request)); + } else { + CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request)); + } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } + // -- Reset unit vector + if (s == e_vec_size - 1) { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array)); + array[s] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array)); + } + } + } + num_points_offset += num_points; + } + + // Restore input arrays + CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); + + // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedVectorDestroy(&in_vec)); + CeedCallBackend(CeedVectorDestroy(&out_vec)); + CeedCallBackend(CeedVectorDestroy(&point_coords)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + 
CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Assemble Operator AtPoints //------------------------------------------------------------------------------ +static int CeedOperatorAssembleSingleAtPoints_Ref(CeedOperator op, CeedInt offset, CeedVector values) { + CeedInt num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}, *assembled; + Ceed ceed; + CeedVector point_coords = NULL, in_vec, out_vec; + CeedElemRestriction rstr_points = NULL; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Ref *impl; + + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + // Setup + CeedCallBackend(CeedOperatorSetupAtPoints_Ref(op)); + + // Ceed + { + Ceed ceed_parent; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed)); + CeedCallBackend(CeedDestroy(&ceed_parent)); + } + + // Point coordinates + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords)); + + // Input and output vectors + { + CeedSize input_size, output_size; + + CeedCallBackend(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); + CeedCallBackend(CeedVectorCreate(ceed, input_size, &in_vec)); + CeedCallBackend(CeedVectorCreate(ceed, output_size, &out_vec)); + CeedCallBackend(CeedVectorSetValue(out_vec, 0.0)); + } + + // Assembled array + 
CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_HOST, &assembled)); + + // Clear input Evecs + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active; + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active || impl->skip_rstr_in[i]) continue; + CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0)); + } + + // Input Evecs and Restriction + CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, CEED_REQUEST_IMMEDIATE)); + + // Loop through elements + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points, e_vec_size = 0; + + // Setup points for element + CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, + CEED_REQUEST_IMMEDIATE)); + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points)); + if (num_points < 1) continue; + + // Input basis apply for non-active bases + CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec, + impl->point_coords_elem, true, false, e_data, impl, CEED_REQUEST_IMMEDIATE)); + + // Loop over points on element + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active_at_points = true, is_active; + CeedInt elem_size_active = 1; + CeedRestrictionType rstr_type; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // -- Skip non-active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active || impl->skip_rstr_in[i]) continue; + + // -- Get active restriction type + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + 
CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS; + if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active)); + else elem_size_active = num_points; + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + + e_vec_size = elem_size_active * num_comp_active; + for (CeedInt s = 0; s < e_vec_size; s++) { + const CeedInt comp_in = s / elem_size_active; + const CeedInt node_in = s % elem_size_active; + + // -- Update unit vector + { + CeedScalar *array; + + if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array)); + array[s] = 1.0; + if (s > 0) array[s - 1] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array)); + } + // Input basis apply for active bases + CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, + in_vec, impl->point_coords_elem, false, true, e_data, impl, CEED_REQUEST_IMMEDIATE)); + + // -- Q function + if (!impl->is_identity_qf) { + CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out)); + } + + // -- Output basis apply and restriction + CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields, + num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec, + impl->point_coords_elem, true, impl, CEED_REQUEST_IMMEDIATE)); + + // -- Build element matrix + for (CeedInt j = 0; j < num_output_fields; j++) { + bool is_active; + CeedInt elem_size = 0; + CeedRestrictionType rstr_type; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // ---- Skip non-active output + 
CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (!is_active || impl->skip_rstr_out[j]) continue; + + // ---- Check if elem size matches + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); + if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + continue; + } + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size)); + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + } + { + CeedInt num_comp = 0; + + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + if (e_vec_size != num_comp * elem_size) continue; + } + // ---- Copy output + { + const CeedScalar *output; + + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_out[j], CEED_MEM_HOST, &output)); + for (CeedInt k = 0; k < e_vec_size; k++) { + const CeedInt comp_out = k / elem_size_active; + const CeedInt node_out = k % elem_size_active; + + assembled[offset + e * e_vec_size * e_vec_size + (comp_in * num_comp_active + comp_out) * elem_size_active * elem_size_active + + node_out * elem_size_active + node_in] = output[k]; + } + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_out[j], &output)); + } + } + // -- Reset unit vector + if (s == e_vec_size - 1) { + CeedScalar *array; + + CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array)); + array[s] = 0.0; + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array)); + } + } + } + num_points_offset += num_points; + } + + // Restore input arrays + CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, 
e_data, impl)); + + // Restore assembled values + CeedCallBackend(CeedVectorRestoreArray(values, &assembled)); + + // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedVectorDestroy(&in_vec)); + CeedCallBackend(CeedVectorDestroy(&out_vec)); + CeedCallBackend(CeedVectorDestroy(&point_coords)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); + return CEED_ERROR_SUCCESS; +} //------------------------------------------------------------------------------ // Operator Destroy @@ -1119,6 +1741,10 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) { CeedOperator_Ref *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedFree(&impl->skip_rstr_in)); + CeedCallBackend(CeedFree(&impl->skip_rstr_out)); + CeedCallBackend(CeedFree(&impl->e_data_out_indices)); + CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } @@ -1140,12 +1766,6 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) { CeedCallBackend(CeedFree(&impl->q_vecs_out)); CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem)); - // QFunction assembly - for (CeedInt i = 0; i < impl->num_active_in; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); - } - CeedCallBackend(CeedFree(&impl->qf_active_in)); - CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -1164,6 +1784,7 @@ int CeedOperatorCreate_Ref(CeedOperator op) { CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -1178,10 +1799,13 @@ int 
CeedOperatorCreateAtPoints_Ref(CeedOperator op) { CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Ref)); - CeedCallBackend( - CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", + CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c index d2bbd07ad1..caedcbbad1 100644 --- a/backends/ref/ceed-ref-qfunction.c +++ b/backends/ref/ceed-ref-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -71,6 +71,7 @@ int CeedQFunctionCreate_Ref(CeedQFunction qf) { CeedCallBackend(CeedQFunctionSetData(qf, impl)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c index 9fd2d013db..6c3e500560 100644 --- a/backends/ref/ceed-ref-qfunctioncontext.c +++ b/backends/ref/ceed-ref-qfunctioncontext.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -131,6 +131,7 @@ int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", CeedQFunctionContextRestoreData_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreData_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c index 08416e4d06..de65e5854b 100644 --- a/backends/ref/ceed-ref-restriction.c +++ b/backends/ref/ceed-ref-restriction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -17,8 +17,8 @@ // Core ElemRestriction Apply Code //------------------------------------------------------------------------------ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, - CeedSize v_offset, const CeedScalar *__restrict__ uu, + const CeedInt start, const CeedInt stop, const CeedInt num_elem, + const CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // No offsets provided, identity restriction bool has_backend_strides; @@ -28,8 +28,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe // CPU backend strides are {1, elem_size, elem_size*num_comp} // This if branch is left separate to allow better inlining for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { + for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize n = 0; n < elem_size; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * (CeedSize)num_comp]; @@ -43,8 +43,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { + for (CeedSize k = 
0; k < num_comp; k++) { + for (CeedSize n = 0; n < elem_size; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]]; @@ -57,15 +57,15 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe } static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, - CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, + const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Default restriction with offsets CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; } @@ -75,15 +75,15 @@ static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRes } static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, - CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, + const CeedInt num_elem, const CeedInt 
elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Restriction with orientations CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0); @@ -94,15 +94,15 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemR } static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, - CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, + const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Restriction with tridiagonal transformation CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedSize n = 0; CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { @@ -112,7 +112,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; } - CeedPragmaSIMD for (n = 1; n < elem_size - 1; 
n++) { + for (n = 1; n < elem_size - 1; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * @@ -136,16 +136,16 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE } static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, - const CeedInt block_size, const CeedInt comp_stride, CeedInt start, - CeedInt stop, CeedInt num_elem, CeedInt elem_size, - CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt block_size, const CeedInt comp_stride, + const CeedInt start, const CeedInt stop, const CeedInt num_elem, + const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Restriction with (unsigned) tridiagonal transformation CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize k = 0; k < num_comp; k++) { CeedSize n = 0; CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { @@ -155,7 +155,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Co uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); } - CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { + for (n = 1; n < elem_size - 1; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * @@ -179,9 +179,9 @@ static inline int 
CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Co } static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, - CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt start, const CeedInt stop, const CeedInt num_elem, + const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // No offsets provided, identity restriction bool has_backend_strides; @@ -190,8 +190,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest // CPU backend strides are {1, elem_size, elem_size*num_comp} // This if brach is left separate to allow better inlining for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { + for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize n = 0; n < elem_size; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; } @@ -204,8 +204,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { - CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { + for (CeedSize k = 0; k < num_comp; k++) { + for (CeedSize n = 0; n < elem_size; n++) { CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += uu[e * elem_size * num_comp + (k * 
elem_size + n) * block_size + j - v_offset]; @@ -218,9 +218,9 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest } static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, - CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, + const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Default restriction with offsets CeedElemRestriction_Ref *impl; @@ -242,9 +242,9 @@ static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestr } static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, - CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, + const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Restriction with orientations CeedElemRestriction_Ref *impl; @@ -266,9 +266,9 @@ static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRes } static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, - CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, - CeedScalar *__restrict__ vv) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, + const CeedInt num_elem, 
const CeedInt elem_size, const CeedSize v_offset, + const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Restriction with tridiagonal transformation CeedElemRestriction_Ref *impl; CeedScalar vv_loc[block_size]; @@ -317,8 +317,9 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle } static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, - const CeedInt block_size, const CeedInt comp_stride, CeedInt start, - CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, + const CeedInt block_size, const CeedInt comp_stride, + const CeedInt start, const CeedInt stop, const CeedInt num_elem, + const CeedInt elem_size, const CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { // Restriction with (unsigned) tridiagonal transformation CeedElemRestriction_Ref *impl; @@ -367,8 +368,8 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start, CeedInt stop, - CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu, +static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt start, + const CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { CeedInt num_points, l_vec_offset; CeedSize e_vec_offset = 0; @@ -384,7 +385,7 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes } } else { for (CeedSize i = 0; i < num_points; i++) { - for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset]; + for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] += uu[j * num_points + i + 
e_vec_offset]; } } e_vec_offset += num_points * (CeedSize)num_comp; @@ -393,8 +394,8 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes } static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, - const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, - bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { + const CeedInt comp_stride, const CeedInt start, const CeedInt stop, CeedTransposeMode t_mode, + bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { CeedInt num_elem, elem_size; CeedSize v_offset = 0; CeedRestrictionType rstr_type; @@ -422,8 +423,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co // Sum into for transpose mode switch (rstr_type) { case CEED_RESTRICTION_STRIDED: - CeedCallBackend( - CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); + CeedCallBackend(CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, + vv)); break; case CEED_RESTRICTION_STANDARD: CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, @@ -462,8 +463,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co // Overwrite for notranspose mode switch (rstr_type) { case CEED_RESTRICTION_STRIDED: - CeedCallBackend( - CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); + CeedCallBackend(CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, + uu, vv)); break; case CEED_RESTRICTION_STANDARD: 
CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, @@ -553,6 +554,30 @@ static int CeedElemRestrictionApply_Ref_381(CeedElemRestriction rstr, const Ceed } // LCOV_EXCL_START +static int CeedElemRestrictionApply_Ref_410(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, + CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(rstr, 4, 1, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); +} + +static int CeedElemRestrictionApply_Ref_411(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, + CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(rstr, 4, 1, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); +} + +static int CeedElemRestrictionApply_Ref_480(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, + CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(rstr, 4, 8, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); +} + +static int CeedElemRestrictionApply_Ref_481(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, + CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(rstr, 4, 8, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); +} + static int 
CeedElemRestrictionApply_Ref_510(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { @@ -753,20 +778,32 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, } } + // Expand E-vector size for AtPoints + if (rstr_type == CEED_RESTRICTION_POINTS) { + CeedSize max_points = 0, num_points_total = 0; + + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points = offsets[i + 1] - offsets[i]; + + max_points = CeedIntMax(max_points, num_points); + num_points_total += num_points; + } + // -- Increase size for last element + num_points_total += (max_points - (offsets[num_elem] - offsets[num_elem - 1])); + CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, num_points_total * num_comp)); + } + // Offsets data if (rstr_type != CEED_RESTRICTION_STRIDED) { const char *resource; // Check indices for ref or memcheck backends { - Ceed current = ceed, parent = NULL; + Ceed current = ceed, ceed_parent = NULL; - CeedCallBackend(CeedGetParent(current, &parent)); - while (current != parent) { - current = parent; - CeedCallBackend(CeedGetParent(current, &parent)); - } - CeedCallBackend(CeedGetResource(parent, &resource)); + CeedCallBackend(CeedGetParent(current, &ceed_parent)); + CeedCallBackend(CeedGetResource(ceed_parent, &resource)); + CeedCallBackend(CeedDestroy(&ceed_parent)); } if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked")) { CeedSize l_size; @@ -824,6 +861,18 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, impl->Apply = CeedElemRestrictionApply_Ref_381; break; // LCOV_EXCL_START + case 410: + impl->Apply = CeedElemRestrictionApply_Ref_410; + break; + case 411: + impl->Apply = CeedElemRestrictionApply_Ref_411; + break; + case 480: + impl->Apply = 
CeedElemRestrictionApply_Ref_480; + break; + case 481: + impl->Apply = CeedElemRestrictionApply_Ref_481; + break; case 510: impl->Apply = CeedElemRestrictionApply_Ref_510; break; @@ -856,6 +905,7 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c index a2064cfce6..9d66a2a68d 100644 --- a/backends/ref/ceed-ref-tensor.c +++ b/backends/ref/ceed-ref-tensor.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -51,6 +51,7 @@ int CeedTensorContractCreate_Ref(CeedTensorContract contract) { CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", CeedTensorContractDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-vector.c b/backends/ref/ceed-ref-vector.c index f907d232c8..520afdd61a 100644 --- a/backends/ref/ceed-ref-vector.c +++ b/backends/ref/ceed-ref-vector.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -149,6 +149,7 @@ int CeedVectorCreate_Ref(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Ref)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedVectorSetData(vec, impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/ref/ceed-ref.c b/backends/ref/ceed-ref.c index a3c15faf8f..46af219839 100644 --- a/backends/ref/ceed-ref.c +++ b/backends/ref/ceed-ref.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h index 8eb3b54331..4af06564a5 100644 --- a/backends/ref/ceed-ref.h +++ b/backends/ref/ceed-ref.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -11,11 +11,6 @@ #include #include -typedef struct { - CeedScalar *collo_grad_1d; - bool has_collo_interp; -} CeedBasis_Ref; - typedef struct { CeedScalar *array; CeedScalar *array_borrowed; @@ -36,6 +31,11 @@ typedef struct { CeedRequest *); } CeedElemRestriction_Ref; +typedef struct { + CeedScalar *collo_grad_1d; + bool is_collocated; +} CeedBasis_Ref; + typedef struct { const CeedScalar **inputs; CeedScalar **outputs; @@ -49,15 +49,17 @@ typedef struct { typedef struct { bool is_identity_qf, is_identity_rstr_op; - CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; + CeedInt *e_data_out_indices; uint64_t *input_states; /* State counter of inputs */ + CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ CeedVector *e_vecs_in; /* Single element input E-vectors */ CeedVector *e_vecs_out; /* Single element output E-vectors */ CeedVector *q_vecs_in; /* Single element input Q-vectors */ CeedVector *q_vecs_out; /* Single element output Q-vectors */ CeedInt num_inputs, num_outputs; - CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in, point_coords_elem; + CeedInt qf_size_in, qf_size_out; + CeedVector point_coords_elem; } CeedOperator_Ref; CEED_INTERN int CeedVectorCreate_Ref(CeedSize n, CeedVector vec); diff --git 
a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp index ca469f8d77..b112488569 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp index a4edb6fc2b..ec783e5cc2 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -155,12 +155,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // LCOV_EXCL_STOP } } + CeedCallBackend(CeedBasisDestroy(&basis)); } // Check output bases for Q_1d, dim as well // The only input basis might be CEED_BASIS_NONE for (CeedInt i = 0; i < num_output_fields; i++) { CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { bool is_tensor; @@ -178,6 +178,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // LCOV_EXCL_STOP } } + CeedCallBackend(CeedBasisDestroy(&basis)); } impl->dim = dim; impl->Q_1d = Q_1d; @@ -196,6 +197,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); was_grad_found = true; + CeedCallBackend(CeedBasisDestroy(&basis)); } } for (CeedInt i = 0; i < num_output_fields; i++) { @@ -205,6 +207,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? 
use_collograd_parallelization : true); was_grad_found = true; + CeedCallBackend(CeedBasisDestroy(&basis)); } } } @@ -271,8 +274,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Get elem_size, eval_mode, num_comp CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Set field constants if (eval_mode != CEED_EVAL_WEIGHT) { @@ -321,6 +325,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { case CEED_EVAL_CURL: break; // TODO: Not implemented } + CeedCallBackend(CeedBasisDestroy(&basis)); } code << "\n // -- Output field constants and basis data --\n"; @@ -329,8 +334,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Get elem_size, eval_mode, num_comp CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Set field constants CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); @@ -382,6 +388,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { } // LCOV_EXCL_STOP } + CeedCallBackend(CeedBasisDestroy(&basis)); } code << "\n // -- Element loop --\n"; code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"; @@ -394,8 +401,8 @@ extern "C" int 
CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Get elem_size, eval_mode, num_comp CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Restriction if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { @@ -431,6 +438,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n"; } } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); // Basis action code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; @@ -452,12 +460,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { << i << ", r_t_" << i << ", elem_scratch);\n"; } else { CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i << ", r_t_" << i << ", elem_scratch);\n"; + CeedCallBackend(CeedBasisDestroy(&basis)); } break; case CEED_EVAL_WEIGHT: @@ -466,6 +476,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); impl->W = basis_impl->d_q_weight_1d; code << " Weight" << (dim > 1 ? 
"Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n"; + CeedCallBackend(CeedBasisDestroy(&basis)); break; // No action case CEED_EVAL_DIV: break; // TODO: Not implemented @@ -544,6 +555,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i << ", r_q_" << i << ");\n"; } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); break; case CEED_EVAL_INTERP: code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; @@ -665,8 +677,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Get elem_size, eval_mode, num_comp CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; switch (eval_mode) { @@ -690,6 +702,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? 
std::to_string(i) : "") << ", s_G_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; + CeedCallBackend(CeedBasisDestroy(&basis)); } break; // LCOV_EXCL_START @@ -734,6 +747,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n"; } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } code << " }\n"; @@ -766,8 +780,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Load kernel function CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); - CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp index 100176b2d7..9370c98d5a 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -39,15 +39,6 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Sycl_gen *impl; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); - CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Check for tensor-product bases { bool has_tensor_bases; @@ -57,13 +48,22 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, if (!has_tensor_bases) { CeedOperator op_fallback; - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases"); + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); return CEED_ERROR_SUCCESS; } } + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + // Creation of the operator 
CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op)); @@ -73,12 +73,15 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, if (eval_mode == CEED_EVAL_WEIGHT) { // Skip impl->fields->inputs[i] = NULL; } else { + bool is_active; CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = input_vec; CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &impl->fields->inputs[i])); + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -88,11 +91,13 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, if (eval_mode == CEED_EVAL_WEIGHT) { // Skip impl->fields->outputs[i] = NULL; } else { + bool is_active; CeedVector vec; // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = output_vec; output_vecs[i] = vec; // Check for multiple output modes CeedInt index = -1; @@ -107,6 +112,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, } else { impl->fields->outputs[i] = impl->fields->outputs[index]; } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } } @@ -152,11 +158,14 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = input_vec; CeedCallBackend(CeedVectorRestoreArrayRead(vec, &impl->fields->inputs[i])); + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } } 
@@ -165,10 +174,12 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = output_vec; // Check for multiple output modes CeedInt index = -1; @@ -181,11 +192,14 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, if (index == -1) { CeedCallBackend(CeedVectorRestoreArray(vec, &impl->fields->outputs[i])); } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } } // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -211,6 +225,7 @@ int CeedOperatorCreate_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp index 05774e6237..99d1438269 100644 --- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -38,6 +38,7 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) { CeedCallBackend(CeedFree(&impl->qfunction_source)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -61,6 +62,7 @@ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl_gen)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp index bc1179e4f2..5ba1836197 100644 --- a/backends/sycl-gen/ceed-sycl-gen.hpp +++ b/backends/sycl-gen/ceed-sycl-gen.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp index 3b274c8348..1d67da7e0a 100644 --- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -18,10 +18,9 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { - Ceed ceed_shared; - Ceed_Sycl *data, *shared_data; + Ceed ceed_shared, ceed_ref; + Ceed_Sycl *data; char *resource_root; - const char fallback_resource[] = "/gpu/sycl/ref"; CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/sycl") || !strcmp(resource_root, "/gpu/sycl/gen"), ceed, CEED_ERROR_BACKEND, @@ -35,12 +34,12 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); CeedCallBackend(CeedSetStream_Sycl(ceed_shared, &(data->sycl_queue))); + CeedCallBackend(CeedDestroy(&ceed_shared)); - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); - - Ceed ceed_fallback = NULL; - CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback)); - CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, &(data->sycl_queue))); + CeedCallBackend(CeedInit("/gpu/sycl/ref", &ceed_ref)); + CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref)); + CeedCallBackend(CeedSetStream_Sycl(ceed_ref, &(data->sycl_queue))); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen)); diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp index 54c01f0825..508830fffd 100644 --- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -312,6 +312,7 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran } break; case CEED_EVAL_WEIGHT: + CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v)); break; case CEED_EVAL_NONE: /* handled separately below */ @@ -327,6 +328,7 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -487,6 +489,7 @@ static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v)); break; case CEED_EVAL_WEIGHT: + CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v)); break; case CEED_EVAL_NONE: /* handled separately below */ @@ -502,7 +505,7 @@ static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); - + CeedCallBackend(CeedDestroy(&ceed)); return 
CEED_ERROR_SUCCESS; } @@ -520,11 +523,12 @@ static int CeedBasisDestroy_Sycl(CeedBasis basis) { // Wait for all work to finish before freeing memory CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); - CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); + if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -542,11 +546,12 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) { // Wait for all work to finish before freeing memory CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); - CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context)); + if (impl->d_q_weight) CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -555,11 +560,12 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; CeedBasis_Sycl *impl; + Ceed_Sycl *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); CeedInt num_comp; @@ -581,17 +587,23 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const if 
(!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; - CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device(Q_1d, data->sycl_device, data->sycl_context)); - sycl::event copy_weight = data->sycl_queue.copy(q_weight_1d, impl->d_q_weight_1d, Q_1d, e); + std::vector copy_events; + if (q_weight_1d) { + CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device(Q_1d, data->sycl_device, data->sycl_context)); + sycl::event copy_weight = data->sycl_queue.copy(q_weight_1d, impl->d_q_weight_1d, Q_1d, e); + copy_events.push_back(copy_weight); + } const CeedInt interp_length = Q_1d * P_1d; CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); sycl::event copy_interp = data->sycl_queue.copy(interp_1d, impl->d_interp_1d, interp_length, e); + copy_events.push_back(copy_interp); CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); sycl::event copy_grad = data->sycl_queue.copy(grad_1d, impl->d_grad_1d, interp_length, e); + copy_events.push_back(copy_grad); - CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad})); + CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); std::vector kernel_ids = {sycl::get_kernel_id>(), sycl::get_kernel_id>(), sycl::get_kernel_id>(), sycl::get_kernel_id>()}; @@ -609,6 +621,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const // Register backend functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -617,11 +630,12 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const //------------------------------------------------------------------------------ int 
CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; CeedBasisNonTensor_Sycl *impl; + Ceed_Sycl *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); CeedInt num_comp; @@ -636,24 +650,31 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; - CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device(num_qpts, data->sycl_device, data->sycl_context)); - sycl::event copy_weight = data->sycl_queue.copy(q_weight, impl->d_q_weight, num_qpts, e); + std::vector copy_events; + if (q_weight) { + CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device(num_qpts, data->sycl_device, data->sycl_context)); + sycl::event copy_weight = data->sycl_queue.copy(q_weight, impl->d_q_weight, num_qpts, e); + copy_events.push_back(copy_weight); + } const CeedInt interp_length = num_qpts * num_nodes; CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); sycl::event copy_interp = data->sycl_queue.copy(interp, impl->d_interp, interp_length, e); + copy_events.push_back(copy_interp); const CeedInt grad_length = num_qpts * num_nodes * dim; CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device(grad_length, data->sycl_device, data->sycl_context)); sycl::event copy_grad = data->sycl_queue.copy(grad, impl->d_grad, grad_length, e); + copy_events.push_back(copy_grad); - CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad})); + CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); CeedCallBackend(CeedBasisSetData(basis, impl)); // Register backend 
functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp index f3cf95641a..45cef53918 100644 --- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE // files for details. // @@ -23,9 +23,9 @@ class CeedOperatorSyclLinearAssembleFallback; //------------------------------------------------------------------------------ // Get Basis Emode Pointer //------------------------------------------------------------------------------ -void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, const CeedScalar *interp, +void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar *interp, const CeedScalar *grad) { - switch (e_mode) { + switch (eval_mode) { case CEED_EVAL_NONE: *basis_ptr = identity; break; @@ -78,21 +78,24 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) { // Diag data if (impl->diag) { - CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); - CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); + CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_in)); + CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_out)); CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); - CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_in, sycl_data->sycl_context)); - 
CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_out, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_in, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_out, sycl_data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_in, sycl_data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_out, sycl_data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_in, sycl_data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_out, sycl_data->sycl_context)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + CeedCallBackend(CeedBasisDestroy(&impl->diag->basis_in)); + CeedCallBackend(CeedBasisDestroy(&impl->diag->basis_out)); } CeedCallBackend(CeedFree(&impl->diag)); @@ -104,6 +107,7 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) { CeedCallBackend(CeedFree(&impl->asmb)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -115,7 +119,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool Ceed ceed; CeedSize q_size; bool is_strided, skip_restriction; - CeedInt dim, size; + CeedInt size; CeedOperatorField *op_fields; CeedQFunctionField *qf_fields; @@ -130,46 +134,47 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool // Loop over fields for (CeedInt i = 0; i < num_fields; i++) { - CeedEvalMode e_mode; + CeedEvalMode eval_mode; CeedVector vec; - CeedElemRestriction rstr; - CeedBasis basis; + CeedElemRestriction elem_rstr; - 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); is_strided = false; skip_restriction = false; - if (e_mode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + if (eval_mode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); // Check whether this field can skip the element restriction: - // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. + // must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. // First, check whether the field is input or output: if (is_input) { // Check for passive input: CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec != CEED_VECTOR_ACTIVE) { - // Check e_mode - if (e_mode == CEED_EVAL_NONE) { + // Check eval_mode + if (eval_mode == CEED_EVAL_NONE) { // Check for is_strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (is_strided) { // Check if vector is already in preferred backend ordering - CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &skip_restriction)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); } } } + CeedCallBackend(CeedVectorDestroy(&vec)); } if (skip_restriction) { // We do not need an E-Vector, but will use the input field vector's data directly in the operator application e_vecs[i + start_e] = NULL; } else { - CeedCallBackend(CeedElemRestrictionCreateVector(rstr, NULL, &e_vecs[i + start_e])); + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); } + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } - switch (e_mode) { + switch (eval_mode) { case CEED_EVAL_NONE: 
CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); q_size = (CeedSize)num_elem * Q * size; @@ -181,24 +186,28 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - CeedCallBackend(CeedBasisGetDimension(basis, &dim)); q_size = (CeedSize)num_elem * Q * size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; - case CEED_EVAL_WEIGHT: // Only on input fields - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + case CEED_EVAL_WEIGHT: { + CeedBasis basis; + + // Note: only on input fields q_size = (CeedSize)num_elem * Q; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } case CEED_EVAL_DIV: break; // TODO: Not implemented case CEED_EVAL_CURL: break; // TODO: Not implemented } } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -207,7 +216,6 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool // passive) to the named inputs and outputs of its CeedQFunction. 
//------------------------------------------------------------------------------ static int CeedOperatorSetup_Sycl(CeedOperator op) { - Ceed ceed; bool is_setup_done; CeedInt Q, num_elem, num_input_fields, num_output_fields; CeedQFunctionField *qf_input_fields, *qf_output_fields; @@ -218,7 +226,6 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); @@ -242,6 +249,7 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); CeedCallBackend(CeedOperatorSetSetupDone(op)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -252,35 +260,35 @@ static inline int CeedOperatorSetupInputs_Sycl(CeedInt num_input_fields, CeedQFu CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl, CeedRequest *request) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode e_mode; - CeedVector vec; - CeedElemRestriction rstr; + bool is_active; + CeedEvalMode eval_mode; + CeedVector vec; // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) { if (skip_active) continue; else vec = in_vec; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); - if (e_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { - // Get input vector - 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr)); - if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; // Restrict, if necessary if (!impl->e_vecs[i]) { // No restriction for this field; read data directly from vec. CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); - // Get evec + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } return CEED_ERROR_SUCCESS; } @@ -292,36 +300,33 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedInt elem_size, size; - CeedElemRestriction rstr; - CeedEvalMode e_mode; - CeedBasis basis; + CeedEvalMode eval_mode; // Skip active input if (skip_active) { + bool is_active; CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + is_active = vec == CEED_VECTOR_ACTIVE; + CeedCallBackend(CeedVectorDestroy(&vec)); + if (is_active) continue; } - // Get elem_size, e_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); - 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action - switch (e_mode) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + switch (eval_mode) { case CEED_EVAL_NONE: CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); break; case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); - break; - case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } case CEED_EVAL_WEIGHT: break; // No action case CEED_EVAL_DIV: @@ -339,24 +344,26 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie static inline int CeedOperatorRestoreInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode e_mode; + bool is_active; + CeedEvalMode eval_mode; CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; // Skip active input if (skip_active) { - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; + if (is_active) continue; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], 
&e_mode)); - if (e_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { if (!impl->e_vecs[i]) { // This was a skip_restriction case - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); } else { CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); } } + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); } return CEED_ERROR_SUCCESS; } @@ -404,12 +411,12 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec // Output basis apply if needed for (CeedInt i = 0; i < num_output_fields; i++) { - CeedElemRestriction rstr; - CeedBasis basis; + CeedElemRestriction elem_rstr; - // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + // Get elem_size, eval_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); // Basis action @@ -417,63 +424,60 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec case CEED_EVAL_NONE: break; case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: { + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in])); - break; - case CEED_EVAL_GRAD: - 
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in])); + CeedCallBackend(CeedBasisDestroy(&basis)); break; + } // LCOV_EXCL_START case CEED_EVAL_WEIGHT: - Ceed ceed; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); break; // Should not occur case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - Ceed ceed; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); + case CEED_EVAL_CURL: + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); break; // Should not occur - } - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP } } // Output restriction for (CeedInt i = 0; i < num_output_fields; i++) { + bool is_active; + CeedEvalMode eval_mode; CeedVector vec; - CeedElemRestriction rstr; + CeedElemRestriction elem_rstr; // Restore evec CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_NONE) { CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_e_in], &e_data[i + num_input_fields])); } - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr)); - // Active - if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; - - CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request)); + 
CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + is_active = vec == CEED_VECTOR_ACTIVE; + if (is_active) vec = out_vec; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request)); + if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Core code for assembling linear QFunction //------------------------------------------------------------------------------ -static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - Ceed ceed, ceed_parent; +static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, + CeedElemRestriction *elem_rstr, CeedRequest *request) { + Ceed ceed_parent; CeedSize q_size; CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; @@ -483,7 +487,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Sycl *impl; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); @@ -506,9 +509,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, CeedScalar 
*q_vec_array; CeedVector vec; - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); @@ -516,13 +518,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); for (CeedInt field = 0; field < size; field++) { q_size = (CeedSize)Q * num_elem; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend( - CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); + CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field])); + CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, + &q_vec_array[field * Q * num_elem])); } num_active_in += size; CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } + CeedCallBackend(CeedVectorDestroy(&vec)); } impl->num_active_in = num_active_in; impl->qf_active_in = active_in; @@ -533,19 +536,20 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, for (CeedInt i = 0; i < num_output_fields; i++) { CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); num_active_out += size; } + CeedCallBackend(CeedVectorDestroy(&vec)); } impl->num_active_out = num_active_out; } // Check sizes - CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, 
CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + CeedCheck(num_active_in > 0 && num_active_out > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, + "Cannot assemble QFunction without active inputs and outputs"); // Build objects if needed if (build_objects) { @@ -553,8 +557,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ // Create output restriction - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, rstr)); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, elem_rstr)); // Create assembled vector CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } @@ -575,14 +578,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Apply QFunction CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); @@ -592,12 +595,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; - // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], 
&vec)); // Check if active output + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Restore input arrays @@ -605,21 +608,24 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, // Restore output CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); + CeedCallBackend(CeedDestroy(&ceed_parent)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, rstr, request); +static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *elem_rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, elem_rstr, request); } //------------------------------------------------------------------------------ // Update Assembled Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &rstr, request); +static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction elem_rstr, + CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &elem_rstr, request); } 
//------------------------------------------------------------------------------ @@ -628,10 +634,10 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedV static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { Ceed ceed; Ceed_Sycl *sycl_data; - CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0; - CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; - CeedBasis basis_in = NULL, basis_out = NULL; + CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, num_comp = 0, dim = 1, num_eval_mode_out = 0; + CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; CeedQFunctionField *qf_fields; CeedQFunction qf; CeedOperatorField *op_fields; @@ -649,28 +655,31 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedEvalMode e_mode; - CeedElemRestriction rstr; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); - CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); + if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in)); + CeedCheck(rstr_in == elem_rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCheck(basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases"); + 
CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); - CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND, - "Backend does not implement multi-field non-composite operator diagonal assembly"); - rstr_in = rstr; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); - switch (e_mode) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + switch (eval_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); - e_mode_in[num_e_mode_in] = e_mode; - num_e_mode_in += 1; + CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in)); + eval_mode_in[num_eval_mode_in] = eval_mode; + num_eval_mode_in += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); - for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; - num_e_mode_in += dim; + CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in)); + for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode; + num_eval_mode_in += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -678,7 +687,9 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { break; // Caught by QF Assembly } } + CeedCallBackend(CeedVectorDestroy(&vec)); } + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); // Determine active output basis CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); @@ -688,26 +699,30 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedEvalMode e_mode; - CeedElemRestriction rstr; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], 
&rstr)); - CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, - "Backend does not implement multi-field non-composite operator diagonal assembly"); - rstr_out = rstr; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); - switch (e_mode) { + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); + if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out)); + CeedCheck(rstr_out == elem_rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCheck(basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases"); + CeedCallBackend(CeedBasisDestroy(&basis)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + switch (eval_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); - e_mode_out[num_e_mode_out] = e_mode; - num_e_mode_out += 1; + CeedCallBackend(CeedRealloc(num_eval_mode_out + 1, &eval_mode_out)); + eval_mode_out[num_eval_mode_out] = eval_mode; + num_eval_mode_out += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); - for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; - num_e_mode_out += dim; + CeedCallBackend(CeedRealloc(num_eval_mode_out + dim, &eval_mode_out)); + for (CeedInt d = 0; d < dim; d++) eval_mode_out[num_eval_mode_out + d] = eval_mode; + num_eval_mode_out += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -715,6 +730,7 @@ static inline int 
CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { break; // Caught by QF Assembly } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Operator data struct @@ -723,17 +739,18 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Sycl *diag = impl->diag; - diag->basis_in = basis_in; - diag->basis_out = basis_out; - diag->h_e_mode_in = e_mode_in; - diag->h_e_mode_out = e_mode_out; - diag->num_e_mode_in = num_e_mode_in; - diag->num_e_mode_out = num_e_mode_out; + CeedCallBackend(CeedBasisReferenceCopy(basis_in, &diag->basis_in)); + CeedCallBackend(CeedBasisReferenceCopy(basis_out, &diag->basis_out)); + diag->h_eval_mode_in = eval_mode_in; + diag->h_eval_mode_out = eval_mode_out; + diag->num_eval_mode_in = num_eval_mode_in; + diag->num_eval_mode_out = num_eval_mode_out; // Kernel parameters CeedInt num_nodes, num_qpts; CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); diag->num_nodes = num_nodes; diag->num_qpts = num_qpts; diag->num_comp = num_comp; @@ -746,8 +763,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { // CEED_EVAL_NONE CeedScalar *identity = NULL; bool has_eval_none = false; - for (CeedInt i = 0; i < num_e_mode_in; i++) has_eval_none = has_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); - for (CeedInt i = 0; i < num_e_mode_out; i++) has_eval_none = has_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); + for (CeedInt i = 0; i < num_eval_mode_in; i++) has_eval_none = has_eval_none || (eval_mode_in[i] == CEED_EVAL_NONE); + for (CeedInt i = 0; i < num_eval_mode_out; i++) has_eval_none = has_eval_none || (eval_mode_out[i] == CEED_EVAL_NONE); std::vector e; @@ -785,20 +802,27 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { sycl::event grad_out_copy = 
sycl_data->sycl_queue.copy(grad_out, diag->d_grad_out, g_len, e); copy_events.push_back(grad_out_copy); - // Arrays of e_modes - CeedCallSycl(ceed, diag->d_e_mode_in = sycl::malloc_device(num_e_mode_in, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event e_mode_in_copy = sycl_data->sycl_queue.copy(e_mode_in, diag->d_e_mode_in, num_e_mode_in, e); - copy_events.push_back(e_mode_in_copy); + // Arrays of eval_modes + CeedCallSycl(ceed, diag->d_eval_mode_in = sycl::malloc_device(num_eval_mode_in, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event eval_mode_in_copy = sycl_data->sycl_queue.copy(eval_mode_in, diag->d_eval_mode_in, num_eval_mode_in, e); + copy_events.push_back(eval_mode_in_copy); - CeedCallSycl(ceed, diag->d_e_mode_out = sycl::malloc_device(num_e_mode_out, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event e_mode_out_copy = sycl_data->sycl_queue.copy(e_mode_out, diag->d_e_mode_out, num_e_mode_out, e); - copy_events.push_back(e_mode_out_copy); + CeedCallSycl(ceed, diag->d_eval_mode_out = sycl::malloc_device(num_eval_mode_out, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event eval_mode_out_copy = sycl_data->sycl_queue.copy(eval_mode_out, diag->d_eval_mode_out, num_eval_mode_out, e); + copy_events.push_back(eval_mode_out_copy); // Restriction - diag->diag_rstr = rstr_out; + CeedCallBackend(CeedElemRestrictionReferenceCopy(rstr_out, &diag->diag_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); // Wait for all copies to complete and handle exceptions CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); + + // Cleanup + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -807,18 +831,18 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { 
//------------------------------------------------------------------------------ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool is_point_block, const CeedInt num_elem, const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembled_qf_array, CeedScalar *elem_diag_array) { - const CeedSize num_nodes = diag->num_nodes; - const CeedSize num_qpts = diag->num_qpts; - const CeedSize num_comp = diag->num_comp; - const CeedSize num_e_mode_in = diag->num_e_mode_in; - const CeedSize num_e_mode_out = diag->num_e_mode_out; - const CeedScalar *identity = diag->d_identity; - const CeedScalar *interp_in = diag->d_interp_in; - const CeedScalar *grad_in = diag->d_grad_in; - const CeedScalar *interp_out = diag->d_interp_out; - const CeedScalar *grad_out = diag->d_grad_out; - const CeedEvalMode *e_mode_in = diag->d_e_mode_in; - const CeedEvalMode *e_mode_out = diag->d_e_mode_out; + const CeedSize num_nodes = diag->num_nodes; + const CeedSize num_qpts = diag->num_qpts; + const CeedSize num_comp = diag->num_comp; + const CeedSize num_eval_mode_in = diag->num_eval_mode_in; + const CeedSize num_eval_mode_out = diag->num_eval_mode_out; + const CeedScalar *identity = diag->d_identity; + const CeedScalar *interp_in = diag->d_interp_in; + const CeedScalar *grad_in = diag->d_grad_in; + const CeedScalar *interp_out = diag->d_interp_out; + const CeedScalar *grad_out = diag->d_grad_out; + const CeedEvalMode *eval_mode_in = diag->d_eval_mode_in; + const CeedEvalMode *eval_mode_out = diag->d_eval_mode_out; sycl::range<1> kernel_range(num_elem * num_nodes); @@ -834,18 +858,18 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i // Each element CeedInt d_out = -1; // Each basis eval mode pair - for (CeedSize e_out = 0; e_out < num_e_mode_out; e_out++) { + for (CeedSize e_out = 0; e_out < num_eval_mode_out; e_out++) { const CeedScalar *bt = NULL; - if (e_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out; - CeedOperatorGetBasisPointer_Sycl(&bt, 
e_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]); + if (eval_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out; + CeedOperatorGetBasisPointer_Sycl(&bt, eval_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]); CeedInt d_in = -1; - for (CeedSize e_in = 0; e_in < num_e_mode_in; e_in++) { + for (CeedSize e_in = 0; e_in < num_eval_mode_in; e_in++) { const CeedScalar *b = NULL; - if (e_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in; - CeedOperatorGetBasisPointer_Sycl(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]); + if (eval_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in; + CeedOperatorGetBasisPointer_Sycl(&b, eval_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]); // Each component for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) { // Each qpoint/node pair @@ -856,7 +880,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i for (CeedSize q = 0; q < num_qpts; q++) { const CeedScalar qf_value = - assembled_qf_array[((((e_in * num_comp + comp_in) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + + assembled_qf_array[((((e_in * num_comp + comp_in) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + q]; e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid]; @@ -869,7 +893,8 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i for (CeedSize q = 0; q < num_qpts; q++) { const CeedScalar qf_value = - assembled_qf_array[((((e_in * num_comp + comp_out) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + q]; + assembled_qf_array[((((e_in * num_comp + comp_out) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + + q]; e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid]; } elem_diag_array[(comp_out * num_elem + e) * num_nodes + tid] += e_value; @@ -885,22 +910,26 @@ 
static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i // Assemble diagonal common code //------------------------------------------------------------------------------ static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { - Ceed ceed; - Ceed_Sycl *sycl_data; - CeedInt num_elem; - CeedScalar *elem_diag_array; - const CeedScalar *assembled_qf_array; - CeedVector assembled_qf = NULL; - CeedElemRestriction rstr = NULL; - CeedOperator_Sycl *impl; + Ceed ceed; + Ceed_Sycl *sycl_data; + CeedInt num_elem; + CeedScalar *elem_diag_array; + const CeedScalar *assembled_qf_array; + CeedVector assembled_qf = NULL; + CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedGetData(ceed, &sycl_data)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); - CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); + { + CeedElemRestriction elem_rstr = NULL; + + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &elem_rstr, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + } // Setup if (!impl->diag) { @@ -968,9 +997,9 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op, //------------------------------------------------------------------------------ // Single operator assembly setup //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { +static int CeedOperatorAssembleSingleSetup_Sycl(CeedOperator op) { Ceed ceed; - CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_e_mode_out = 0, 
+ CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_eval_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_qpts = 0, elem_size = 0, num_elem, num_comp, mat_start = 0; CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; @@ -992,63 +1021,81 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); // Note that the kernel will treat each dimension of a gradient action separately; - // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ e_mode_in will increment by dim. - // However, for the purposes of load_ing the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, + // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ eval_mode_in will increment by dim. + // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, // so num_B_in_mats_to_load will be incremented by 1. 
for (CeedInt i = 0; i < num_input_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; + CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr)); + if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); + if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); + CeedCheck(basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); eval_mode_in[num_B_in_mats_to_load] = eval_mode; num_B_in_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_e_mode_in += dim; + num_eval_mode_in += dim; size_B_in += dim * elem_size * num_qpts; } else { - num_e_mode_in += 1; + num_eval_mode_in += 1; size_B_in += elem_size * num_qpts; } } } + CeedCallBackend(CeedVectorDestroy(&vec)); } // Determine active output basis; basis_out and rstr_out only used if same as input, TODO CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); for (CeedInt i = 0; i < 
num_output_fields; i++) { - CeedEvalMode eval_mode; - CeedVector vec; + CeedVector vec; CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); - CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr)); + if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out)); + CeedCheck(rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); + CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); + CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); + if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); + CeedCheck(basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); + CeedCallBackend(CeedBasisDestroy(&basis)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); eval_mode_out[num_B_out_mats_to_load] = eval_mode; num_B_out_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_e_mode_out += dim; + num_eval_mode_out += dim; size_B_out += dim * elem_size * num_qpts; } else { - num_e_mode_out += 1; + num_eval_mode_out += 1; size_B_out += elem_size * num_qpts; } } } + CeedCallBackend(CeedVectorDestroy(&vec)); } - CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); + CeedCheck(num_eval_mode_in > 0 && 
num_eval_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); @@ -1061,16 +1108,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Kernel setup - int elems_per_block = 1; - asmb->elems_per_block = elems_per_block; - asmb->block_size_x = elem_size; - asmb->block_size_y = elem_size; - asmb->num_e_mode_in = num_e_mode_in; - asmb->num_e_mode_out = num_e_mode_out; - asmb->num_qpts = num_qpts; - asmb->num_nodes = elem_size; - asmb->block_size = elem_size * elem_size * elems_per_block; - asmb->num_comp = num_comp; + int elems_per_block = 1; + asmb->elems_per_block = elems_per_block; + asmb->block_size_x = elem_size; + asmb->block_size_y = elem_size; + asmb->num_eval_mode_in = num_eval_mode_in; + asmb->num_eval_mode_out = num_eval_mode_out; + asmb->num_qpts = num_qpts; + asmb->num_nodes = elem_size; + asmb->block_size = elem_size * elem_size * elems_per_block; + asmb->num_comp = num_comp; // Build 'full' B matrices (not 1D arrays used for tensor-product matrices CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); @@ -1127,6 +1174,12 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { mat_start += dim * elem_size * num_qpts; } } + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); + CeedCallBackend(CeedBasisDestroy(&basis_in)); + CeedCallBackend(CeedBasisDestroy(&basis_out)); + CeedCallBackend(CeedQFunctionDestroy(&qf)); return CEED_ERROR_SUCCESS; } @@ -1137,25 +1190,25 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp CeedScalar *values_array) { // This kernels assumes B_in and B_out have the same number of quadrature points and basis points. 
// TODO: expand to more general cases - CeedOperatorAssemble_Sycl *asmb = impl->asmb; - const CeedInt num_elem = asmb->num_elem; - const CeedSize num_nodes = asmb->num_nodes; - const CeedSize num_comp = asmb->num_comp; - const CeedSize num_qpts = asmb->num_qpts; - const CeedSize num_e_mode_in = asmb->num_e_mode_in; - const CeedSize num_e_mode_out = asmb->num_e_mode_out; + CeedOperatorAssemble_Sycl *asmb = impl->asmb; + const CeedInt num_elem = asmb->num_elem; + const CeedSize num_nodes = asmb->num_nodes; + const CeedSize num_comp = asmb->num_comp; + const CeedSize num_qpts = asmb->num_qpts; + const CeedSize num_eval_mode_in = asmb->num_eval_mode_in; + const CeedSize num_eval_mode_out = asmb->num_eval_mode_out; // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col const CeedSize comp_out_stride = num_nodes * num_nodes; const CeedSize comp_in_stride = comp_out_stride * num_comp; const CeedSize e_stride = comp_in_stride * num_comp; - // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt - const CeedSize q_e_stride = num_qpts; - const CeedSize q_comp_out_stride = num_elem * q_e_stride; - const CeedSize q_e_mode_out_stride = q_comp_out_stride * num_comp; - const CeedSize q_comp_in_stride = q_e_mode_out_stride * num_e_mode_out; - const CeedSize q_e_mode_in_stride = q_comp_in_stride * num_comp; + // Strides for QF array, slowest --> fastest: eval_mode_in, comp_in, eval_mode_out, comp_out, elem, qpt + const CeedSize q_e_stride = num_qpts; + const CeedSize q_comp_out_stride = num_elem * q_e_stride; + const CeedSize q_eval_mode_out_stride = q_comp_out_stride * num_comp; + const CeedSize q_comp_in_stride = q_eval_mode_out_stride * num_eval_mode_out; + const CeedSize q_eval_mode_in_stride = q_comp_in_stride * num_comp; CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; @@ -1178,24 +1231,24 @@ static int 
CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp CeedScalar result = 0.0; CeedSize qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; - for (CeedSize e_mode_in = 0; e_mode_in < num_e_mode_in; e_mode_in++) { - CeedSize b_in_index = e_mode_in * num_qpts * num_nodes; + for (CeedSize eval_mode_in = 0; eval_mode_in < num_eval_mode_in; eval_mode_in++) { + CeedSize b_in_index = eval_mode_in * num_qpts * num_nodes; - for (CeedSize e_mode_out = 0; e_mode_out < num_e_mode_out; e_mode_out++) { - CeedSize b_out_index = e_mode_out * num_qpts * num_nodes; - CeedSize qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; + for (CeedSize eval_mode_out = 0; eval_mode_out < num_eval_mode_out; eval_mode_out++) { + CeedSize b_out_index = eval_mode_out * num_qpts * num_nodes; + CeedSize qf_index = qf_index_comp + q_eval_mode_out_stride * eval_mode_out + q_eval_mode_in_stride * eval_mode_in; // Perform the B^T D B operation for this 'chunk' of D (the qf_array) for (CeedSize j = 0; j < num_qpts; j++) { result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l]; } - } // end of e_mode_out - } // end of e_mode_in + } // end of eval_mode_out + } // end of eval_mode_in CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l; values_array[val_index] = result; } // end of out component - } // end of in component + } // end of in component }); return CEED_ERROR_SUCCESS; } @@ -1213,20 +1266,20 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons const CeedInt num_nodes = asmb->num_nodes; const CeedInt num_comp = asmb->num_comp; const CeedInt num_qpts = asmb->num_qpts; - const CeedInt num_e_mode_in = asmb->num_e_mode_in; - const CeedInt num_e_mode_out = asmb->num_e_mode_out; + const CeedInt num_eval_mode_in = asmb->num_eval_mode_in; + const CeedInt 
num_eval_mode_out = asmb->num_eval_mode_out; // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt, // comp_in, comp_out, node_row, node_col const CeedInt comp_out_stride = num_nodes * num_nodes; const CeedInt comp_in_stride = comp_out_stride * num_comp; const CeedInt e_stride = comp_in_stride * num_comp; - // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt + // Strides for QF array, slowest --> fastest: eval_mode_in, comp_in, eval_mode_out, comp_out, elem, qpt const CeedInt q_e_stride = num_qpts; const CeedInt q_comp_out_stride = num_elem * q_e_stride; - const CeedInt q_e_mode_out_stride = q_comp_out_stride * num_comp; - const CeedInt q_comp_in_stride = q_e_mode_out_stride * num_e_mode_out; - const CeedInt q_e_mode_in_stride = q_comp_in_stride * num_comp; + const CeedInt q_eval_mode_out_stride = q_comp_out_stride * num_comp; + const CeedInt q_comp_in_stride = q_eval_mode_out_stride * num_eval_mode_out; + const CeedInt q_eval_mode_in_stride = q_comp_in_stride * num_comp; CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; @@ -1255,17 +1308,17 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons for (CeedInt i = 0; i < num_nodes; i++) { CeedScalar result = 0.0; CeedInt qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; - for (CeedInt e_mode_in = 0; e_mode_in < num_e_mode_in; e_mode_in++) { - CeedInt b_in_index = e_mode_in * num_qpts * num_nodes; - for (CeedInt e_mode_out = 0; e_mode_out < num_e_mode_out; e_mode_out++) { - CeedInt b_out_index = e_mode_out * num_qpts * num_nodes; - CeedInt qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; + for (CeedInt eval_mode_in = 0; eval_mode_in < num_eval_mode_in; eval_mode_in++) { + CeedInt b_in_index = eval_mode_in * num_qpts * num_nodes; + for (CeedInt eval_mode_out = 0; 
eval_mode_out < num_eval_mode_out; eval_mode_out++) { + CeedInt b_out_index = eval_mode_out * num_qpts * num_nodes; + CeedInt qf_index = qf_index_comp + q_eval_mode_out_stride * eval_mode_out + q_eval_mode_in_stride * eval_mode_in; // Perform the B^T D B operation for this 'chunk' of D (the qf_array) for (CeedInt j = 0; j < num_qpts; j++) { result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l]; } - } // end of e_mode_out - } // end of e_mode_in + } // end of eval_mode_out + } // end of eval_mode_in CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l; values_array[val_index] = result; } // end of loop over element node index, i @@ -1284,7 +1337,7 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons // input restriction/basis per operator (could have multiple basis eval modes). // TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) { +static int CeedOperatorAssembleSingle_Sycl(CeedOperator op, CeedInt offset, CeedVector values) { Ceed ceed; Ceed_Sycl *sycl_data; CeedScalar *values_array; @@ -1294,12 +1347,13 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedGetData(ceed, &sycl_data)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Setup if (!impl->asmb) { - CeedCallBackend(CeedSingleOperatorAssembleSetup_Sycl(op)); + CeedCallBackend(CeedOperatorAssembleSingleSetup_Sycl(op)); assert(impl->asmb != NULL); } @@ -1341,11 +1395,12 @@ int CeedOperatorCreate_Sycl(CeedOperator op) { 
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl)); - CeedCallBackend( - CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl)); - CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", + CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp index 23e792f90e..7a68343f29 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp index 759b9b9a5a..82cac87b6d 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -35,8 +35,8 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { CeedQFunctionField *input_fields, *output_fields; CeedQFunction_Sycl *impl; - CeedCallBackend(CeedQFunctionGetData(qf, (void **)&impl)); // QFunction is built + CeedCallBackend(CeedQFunctionGetData(qf, (void **)&impl)); if (impl->QFunction) return CEED_ERROR_SUCCESS; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); @@ -118,7 +118,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { for (CeedInt i = 0; i < num_input_fields; ++i) { code << " CeedScalar U_" << i << "[" << input_sizes[i] << "];\n"; } - code << " const CeedScalar *inputs[" << num_input_fields << "] = {U_0"; + code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0"; for (CeedInt i = 1; i < num_input_fields; i++) { code << ", U_" << i << "\n"; } @@ -129,7 +129,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " CeedScalar V_" << i << "[" << output_sizes[i] << "];\n"; } - code << " CeedScalar *outputs[" << num_output_fields << "] = {V_0"; + code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0"; for (CeedInt i = 1; i < num_output_fields; i++) { code << ", V_" << i << "\n"; } @@ -175,6 +175,7 @@ extern "C" int 
CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { CeedCallBackend(CeedFree(&qfunction_source)); CeedCallBackend(CeedFree(&read_write_kernel_path)); CeedCallBackend(CeedFree(&read_write_kernel_source)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp index 4de8fcf379..fb0ad6f287 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE // files for details. // @@ -37,6 +37,7 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); @@ -118,6 +119,7 @@ static int CeedQFunctionDestroy_Sycl(CeedQFunction qf) { delete impl->QFunction; delete impl->sycl_module; CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -134,6 +136,7 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) { // Register backend functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp index 1c942a645b..1a08c26cb5 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp +++ 
b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -42,6 +42,7 @@ static inline int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ct if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, impl->h_data, ctx_size, e); CeedCallSycl(ceed, copy_event.wait_and_throw()); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -75,6 +76,7 @@ static inline int CeedQFunctionContextSyncD2H_Sycl(const CeedQFunctionContext ct if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->h_data, impl->d_data, ctx_size, e); CeedCallSycl(ceed, copy_event.wait_and_throw()); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -88,7 +90,9 @@ static inline int CeedQFunctionContextSync_Sycl(const CeedQFunctionContext ctx, case CEED_MEM_DEVICE: return CeedQFunctionContextSyncH2D_Sycl(ctx); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -229,6 +233,7 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx impl->d_data = data; } break; } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -237,9 +242,6 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx // freeing any previously allocated data if applicable //------------------------------------------------------------------------------ static int 
CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { - Ceed ceed; - - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextSetAllInvalid_Sycl(ctx)); switch (mem_type) { case CEED_MEM_HOST: @@ -247,7 +249,9 @@ static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, cons case CEED_MEM_DEVICE: return CeedQFunctionContextSetDataDevice_Sycl(ctx, copy_mode, data); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -260,8 +264,9 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con CeedQFunctionContext_Sycl *impl; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedGetData(ceed, &ceedSycl)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Order queue if needed if (!ceedSycl->sycl_queue.is_in_order()) ceedSycl->sycl_queue.ext_oneapi_submit_barrier(); @@ -291,11 +296,9 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedQFunctionContextGetDataCore_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; bool need_sync = false; CeedQFunctionContext_Sycl *impl; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type @@ -325,11 +328,9 @@ static int CeedQFunctionContextGetDataRead_Sycl(const CeedQFunctionContext ctx, // Get read/write access to the data 
//------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; CeedQFunctionContext_Sycl *impl; CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetDataCore_Sycl(ctx, mem_type, data)); // Mark only pointer for requested memory as valid @@ -360,6 +361,7 @@ static int CeedQFunctionContextDestroy_Sycl(const CeedQFunctionContext ctx) { // Wait for all work to finish before freeing memory CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); CeedCallSycl(ceed, sycl::free(impl->d_data_owned, sycl_data->sycl_context)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedFree(&impl->h_data_owned)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -380,6 +382,7 @@ int CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp index ae765dbafc..723fcc8cfb 100644 --- a/backends/sycl-ref/ceed-sycl-ref.hpp +++ b/backends/sycl-ref/ceed-sycl-ref.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. 
See the top-level LICENSE and NOTICE // files for details. // @@ -86,16 +86,16 @@ typedef struct { CeedBasis basis_in, basis_out; CeedElemRestriction diag_rstr, point_block_diag_rstr; CeedVector elem_diag, point_block_elem_diag; - CeedInt num_e_mode_in, num_e_mode_out, num_nodes; + CeedInt num_eval_mode_in, num_eval_mode_out, num_nodes; CeedInt num_qpts, num_comp; // Kernel parameters - CeedEvalMode *h_e_mode_in, *h_e_mode_out; - CeedEvalMode *d_e_mode_in, *d_e_mode_out; + CeedEvalMode *h_eval_mode_in, *h_eval_mode_out; + CeedEvalMode *d_eval_mode_in, *d_eval_mode_out; CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; } CeedOperatorDiag_Sycl; typedef struct { CeedInt num_elem, block_size_x, block_size_y, elems_per_block; - CeedInt num_e_mode_in, num_e_mode_out, num_qpts, num_nodes, block_size, num_comp; // Kernel parameters + CeedInt num_eval_mode_in, num_eval_mode_out, num_qpts, num_nodes, block_size, num_comp; // Kernel parameters bool fallback; CeedScalar *d_B_in, *d_B_out; } CeedOperatorAssemble_Sycl; @@ -106,7 +106,6 @@ typedef struct { CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator CeedInt num_e_in; CeedInt num_e_out; - CeedInt num_inputs, num_outputs; CeedInt num_active_in, num_active_out; CeedVector *qf_active_in; CeedOperatorDiag_Sycl *diag; diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp index 6229003cb4..ffa5a78d7d 100644 --- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE // files for details. 
// diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp index d85d036587..d33d135198 100644 --- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE // files for details. // @@ -195,6 +195,7 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTranspose // Restore arrays CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -202,10 +203,8 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTranspose // Get offsets //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) { - Ceed ceed; CeedElemRestriction_Sycl *impl; - CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); switch (m_type) { @@ -240,6 +239,7 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction rstr) { CeedCallSycl(ceed, sycl::free(impl->d_t_indices, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_l_vec_indices, data->sycl_context)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -328,6 +328,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction rstr, const CeedCallBackend(CeedFree(&l_vec_indices)); CeedCallBackend(CeedFree(&t_offsets)); CeedCallBackend(CeedFree(&t_indices)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -472,5 +473,6 @@ int 
CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp index 427f51f727..689d84f78e 100644 --- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -44,8 +44,9 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) { CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); CeedCallBackend(CeedVectorGetLength(vec, &length)); @@ -63,6 +64,7 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) { if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; CeedCallSycl(ceed, data->sycl_queue.copy(impl->h_array, impl->d_array, length, e).wait_and_throw()); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -76,8 +78,8 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) { CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); @@ -96,6 +98,7 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) { if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; CeedCallSycl(ceed, data->sycl_queue.copy(impl->d_array, impl->h_array, length, e).wait_and_throw()); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -115,7 +118,9 @@ static int CeedVectorSyncArray_Sycl(const CeedVector vec, CeedMemType mem_type) case CEED_MEM_DEVICE: return CeedVectorSyncH2D_Sycl(vec); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -244,6 +249,7 @@ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMod impl->d_array = 
impl->d_array_borrowed; break; } + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -263,7 +269,9 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t case CEED_MEM_DEVICE: return CeedVectorSetArrayDevice_Sycl(vec, copy_mode, array); } + // LCOV_EXCL_START return CEED_ERROR_UNSUPPORTED; + // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ @@ -295,9 +303,10 @@ static int CeedVectorSetValue_Sycl(CeedVector vec, CeedScalar val) { CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (!impl->d_array && !impl->h_array) { @@ -333,8 +342,10 @@ static int CeedVectorTakeArray_Sycl(CeedVector vec, CeedMemType mem_type, CeedSc CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedDestroy(&ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Order queue if needed if (!data->sycl_queue.is_in_order()) data->sycl_queue.ext_oneapi_submit_barrier(); @@ -447,9 +458,10 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCallBackend(CeedGetData(ceed, &data)); // Compute norm CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); @@ -515,9 +527,10 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) { 
CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); - CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Sycl(data->sycl_queue, impl->d_array, length)); @@ -554,9 +567,10 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) { CeedVector_Sycl *x_impl; CeedCallBackend(CeedVectorGetCeed(x, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetLength(x, &length)); - CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Sycl(data->sycl_queue, x_impl->d_array, alpha, length)); @@ -593,10 +607,11 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) { CeedVector_Sycl *y_impl, *x_impl; CeedCallBackend(CeedVectorGetCeed(y, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetLength(y, &length)); - CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (y_impl->d_array) { @@ -639,11 +654,12 @@ static int CeedVectorPointwiseMult_Sycl(CeedVector w, CeedVector x, CeedVector y CeedVector_Sycl *w_impl, *x_impl, *y_impl; CeedCallBackend(CeedVectorGetCeed(w, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorGetData(w, &w_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetLength(w, &length)); - 
CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (!w_impl->d_array && !w_impl->h_array) { @@ -681,6 +697,7 @@ static int CeedVectorDestroy_Sycl(const CeedVector vec) { CeedCallBackend(CeedFree(&impl->h_array_owned)); CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -711,6 +728,7 @@ int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Scale", CeedVectorScale_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Sycl)); + CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedVectorSetData(vec, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.cpp index 788b608f3a..11db777dce 100644 --- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp +++ b/backends/sycl-ref/kernels/sycl-ref-vector.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE // files for details. // diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp index 27ca11b6e5..162b2acb3c 100644 --- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp +++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -106,6 +106,7 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce //----------- std::vector e; + CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]); if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()}; ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { @@ -127,7 +128,7 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); - + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -143,7 +144,7 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) { CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedCallBackend(CeedGetData(ceed, &data)); CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); - CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); + if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_collo_grad_1d, data->sycl_context)); @@ -156,6 +157,7 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) { delete impl->sycl_module; CeedCallBackend(CeedFree(&impl)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -198,17 +200,23 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; // Copy basis data to GPU - CeedCallSycl(ceed, impl->d_q_weight_1d = 
sycl::malloc_device(Q_1d, data->sycl_device, data->sycl_context)); - sycl::event copy_weight = data->sycl_queue.copy(q_weight_1d, impl->d_q_weight_1d, Q_1d, e); + std::vector copy_events; + if (q_weight_1d) { + CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device(Q_1d, data->sycl_device, data->sycl_context)); + sycl::event copy_weight = data->sycl_queue.copy(q_weight_1d, impl->d_q_weight_1d, Q_1d, e); + copy_events.push_back(copy_weight); + } const CeedInt interp_length = Q_1d * P_1d; CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); sycl::event copy_interp = data->sycl_queue.copy(interp_1d, impl->d_interp_1d, interp_length, e); + copy_events.push_back(copy_interp); CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); sycl::event copy_grad = data->sycl_queue.copy(grad_1d, impl->d_grad_1d, interp_length, e); + copy_events.push_back(copy_grad); - CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad})); + CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); // Compute collocated gradient and copy to GPU impl->d_collo_grad_1d = NULL; @@ -270,6 +278,7 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Sycl_shared)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl_shared)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-shared/ceed-sycl-shared.hpp b/backends/sycl-shared/ceed-sycl-shared.hpp index e4a4c9f203..2e2c3df1ca 100644 --- a/backends/sycl-shared/ceed-sycl-shared.hpp +++ b/backends/sycl-shared/ceed-sycl-shared.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp index d629e76f95..a563a73626 100644 --- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp +++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -19,7 +19,7 @@ //------------------------------------------------------------------------------ static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) { Ceed ceed_ref; - Ceed_Sycl *data, *ref_data; + Ceed_Sycl *data; char *resource_root; CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); diff --git a/backends/sycl/ceed-sycl-common.hpp b/backends/sycl/ceed-sycl-common.hpp index e61cbebc18..f087f8c29a 100644 --- a/backends/sycl/ceed-sycl-common.hpp +++ b/backends/sycl/ceed-sycl-common.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp index e51405d7fa..aa09b693df 100644 --- a/backends/sycl/ceed-sycl-common.sycl.cpp +++ b/backends/sycl/ceed-sycl-common.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE // files for details. // @@ -8,6 +8,7 @@ #include "ceed-sycl-common.hpp" +#include #include #include @@ -107,12 +108,14 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { if (ceed_delegate) { CeedCallBackend(CeedSetStream_Sycl(ceed_delegate, handle)); } + CeedCallBackend(CeedDestroy(&ceed_delegate)); // Set queue and context for Ceed Fallback object - CeedGetOperatorFallbackCeed(ceed, &ceed_fallback); + CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback)); if (ceed_fallback) { CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, handle)); } + CeedCallBackend(CeedDestroy(&ceed_fallback)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp index 67db04f294..1baa1f3ca4 100644 --- a/backends/sycl/ceed-sycl-compile.hpp +++ b/backends/sycl/ceed-sycl-compile.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index 9615114158..f939ca940f 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -61,7 +61,7 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc // TODO: Add architecture flags, optimization flags //------------------------------------------------------------------------------ static inline int CeedJitGetFlags_Sycl(std::vector &flags) { - flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int")}; + flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int"), std::string("-DCEED_RUNNING_JIT_PASS=1")}; return CEED_ERROR_SUCCESS; } @@ -106,7 +106,7 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con zeModuleBuildLogGetString(lz_log, &log_size, nullptr); - CeedCall(CeedCalloc(log_size, &log_message)); + CeedCallBackend(CeedCalloc(log_size, &log_message)); zeModuleBuildLogGetString(lz_log, &log_size, log_message); return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message); @@ -157,8 +157,9 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module"); } - *sycl_kernel = new sycl::kernel(sycl::make_kernel( - {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context)); + *sycl_kernel = new sycl::kernel(sycl::make_kernel({*sycl_module, lz_kernel, + 
sycl::ext::oneapi::level_zero::ownership::transfer}, + data->sycl_context)); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl/online_compiler.hpp b/backends/sycl/online_compiler.hpp index f9fbf529fa..74d2577bc3 100644 --- a/backends/sycl/online_compiler.hpp +++ b/backends/sycl/online_compiler.hpp @@ -63,7 +63,7 @@ class device_arch { class online_compile_error : public sycl::exception { public: online_compile_error() = default; - online_compile_error(const std::string &Msg) : sycl::exception(Msg) {} + online_compile_error(const std::string &Msg) : sycl::exception(make_error_code(errc::invalid), Msg) {} }; /// Designates a source language for the online compiler. diff --git a/backends/weak/ceed-avx-weak.c b/backends/weak/ceed-avx-weak.c new file mode 100644 index 0000000000..639c08f63b --- /dev/null +++ b/backends/weak/ceed-avx-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-avx.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/ceed-backend-weak.c b/backends/weak/ceed-backend-weak.c similarity index 74% rename from backends/ceed-backend-weak.c rename to backends/weak/ceed-backend-weak.c index e4c401f6a9..1ae70f81a6 100644 --- a/backends/ceed-backend-weak.c +++ b/backends/weak/ceed-backend-weak.c @@ -1,10 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed +#include "ceed-backend-weak.h" #include #include #include @@ -17,7 +18,7 @@ static int CeedInit_Weak(const char *resource, Ceed ceed) { } // This function provides a debug target for weak symbols -static int CeedRegister_Weak(const char *name, int num_prefixes, ...) { +int CeedRegister_Weak(const char *name, int num_prefixes, ...) { va_list prefixes; int ierr; @@ -36,9 +37,3 @@ static int CeedRegister_Weak(const char *name, int num_prefixes, ...) { return CEED_ERROR_SUCCESS; } // LCOV_EXCL_STOP - -#define CEED_BACKEND(name, num_prefixes, ...) \ - CEED_INTERN int name(void) __attribute__((weak)); \ - int name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); } -#include "ceed-backend-list.h" -#undef CEED_BACKEND diff --git a/backends/weak/ceed-backend-weak.h b/backends/weak/ceed-backend-weak.h new file mode 100644 index 0000000000..b828c44394 --- /dev/null +++ b/backends/weak/ceed-backend-weak.h @@ -0,0 +1,15 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include + +CEED_INTERN int CeedRegister_Weak(const char *name, int num_prefixes, ...); + +#define CEED_BACKEND(name, num_prefixes, ...) \ + CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); } diff --git a/backends/weak/ceed-cuda-weak.c b/backends/weak/ceed-cuda-weak.c new file mode 100644 index 0000000000..8bc81c78f5 --- /dev/null +++ b/backends/weak/ceed-cuda-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-cuda.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/weak/ceed-hip-weak.c b/backends/weak/ceed-hip-weak.c new file mode 100644 index 0000000000..ec90d3bdee --- /dev/null +++ b/backends/weak/ceed-hip-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-hip.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/weak/ceed-magma-weak.c b/backends/weak/ceed-magma-weak.c new file mode 100644 index 0000000000..cace059504 --- /dev/null +++ b/backends/weak/ceed-magma-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-magma.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/weak/ceed-memcheck-weak.c b/backends/weak/ceed-memcheck-weak.c new file mode 100644 index 0000000000..35fd01613b --- /dev/null +++ b/backends/weak/ceed-memcheck-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-memcheck.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/weak/ceed-sycl-weak.c b/backends/weak/ceed-sycl-weak.c new file mode 100644 index 0000000000..92bc508449 --- /dev/null +++ b/backends/weak/ceed-sycl-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-sycl.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/weak/ceed-xsmm-weak.c b/backends/weak/ceed-xsmm-weak.c new file mode 100644 index 0000000000..6ae36a2822 --- /dev/null +++ b/backends/weak/ceed-xsmm-weak.c @@ -0,0 +1,12 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-backend-weak.h" +// LCOV_EXCL_START +#include "../ceed-backend-list-xsmm.h" +// LCOV_EXCL_STOP +#undef CEED_BACKEND diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c index 90dc19e741..2abaa247c1 100644 --- a/backends/xsmm/ceed-xsmm-blocked.c +++ b/backends/xsmm/ceed-xsmm-blocked.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -25,6 +25,7 @@ static int CeedInit_Xsmm_Blocked(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm)); return CEED_ERROR_SUCCESS; diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c index 68e51a63e3..7892e845be 100644 --- a/backends/xsmm/ceed-xsmm-serial.c +++ b/backends/xsmm/ceed-xsmm-serial.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -25,6 +25,7 @@ static int CeedInit_Xsmm_Serial(const char *resource, Ceed ceed) { // Create reference Ceed that implementation will be dispatched through unless overridden CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + CeedCallBackend(CeedDestroy(&ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm)); return CEED_ERROR_SUCCESS; diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c index 0d7383bf40..21bf22ef8b 100644 --- a/backends/xsmm/ceed-xsmm-tensor.c +++ b/backends/xsmm/ceed-xsmm-tensor.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -16,10 +16,6 @@ //------------------------------------------------------------------------------ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) { - Ceed ceed; - - CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); - if (C == 1) { // Build or query the required kernel const int flags_t = LIBXSMM_GEMM_FLAGS(!t_mode ? 'T' : 'N', 'N'); @@ -30,10 +26,10 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64) : libxsmm_create_gemm_shape(J, A, B, !t_mode ? B : J, B, J, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); - const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); libxsmm_gemm_param gemm_param; - CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); + CeedCheck(kernel, CeedTensorContractReturnCeed(contract), CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); // Run kernel gemm_param.a.primary = (CeedScalar *)&t[0]; @@ -50,10 +46,10 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64) : libxsmm_create_gemm_shape(C, J, B, C, !t_mode ? 
B : J, C, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); - const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); libxsmm_gemm_param gemm_param; - CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); + CeedCheck(kernel, CeedTensorContractReturnCeed(contract), CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); // Run kernel gemm_param.b.primary = (CeedScalar *)&t[0]; diff --git a/backends/xsmm/ceed-xsmm.h b/backends/xsmm/ceed-xsmm.h index 0cb56591fe..124d8d4493 100644 --- a/backends/xsmm/ceed-xsmm.h +++ b/backends/xsmm/ceed-xsmm.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh index 167e374f7b..59ff3cc0d7 100755 --- a/benchmarks/benchmark.sh +++ b/benchmarks/benchmark.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/benchmarks/petsc-bps.sh b/benchmarks/petsc-bps.sh index 46ba51b73c..004dc4b5b8 100755 --- a/benchmarks/petsc-bps.sh +++ b/benchmarks/petsc-bps.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/benchmarks/petsc-bpsraw.sh b/benchmarks/petsc-bpsraw.sh index 666593c7d3..7099bb4ce1 100755 --- a/benchmarks/petsc-bpsraw.sh +++ b/benchmarks/petsc-bpsraw.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/benchmarks/postprocess_base.py b/benchmarks/postprocess_base.py index b9a8d46ddf..f69d283d38 100755 --- a/benchmarks/postprocess_base.py +++ b/benchmarks/postprocess_base.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/benchmarks/postprocess_plot.py b/benchmarks/postprocess_plot.py index 62939e54d7..59101837f3 100755 --- a/benchmarks/postprocess_plot.py +++ b/benchmarks/postprocess_plot.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause diff --git a/benchmarks/postprocess_table.py b/benchmarks/postprocess_table.py index 27a200e0b1..8822a346ff 100755 --- a/benchmarks/postprocess_table.py +++ b/benchmarks/postprocess_table.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/ceed.pc.template b/ceed.pc.template index 56bc5a076f..1d8458a4ee 100644 --- a/ceed.pc.template +++ b/ceed.pc.template @@ -1,6 +1,7 @@ prefix=%prefix% includedir=${prefix}/include libdir=${prefix}/lib +cflags_extra=%opt% Name: CEED Description: Code for Efficient Extensible Discretization diff --git a/common.mk b/common.mk index 4c466b8782..1a53bbf820 100644 --- a/common.mk +++ b/common.mk @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause diff --git a/coverage.info b/coverage.info new file mode 100644 index 0000000000..2e177c3e35 --- /dev/null +++ b/coverage.info @@ -0,0 +1,12848 @@ +TN: +SF:/home/jeremy/Dev/libCEED/backends/blocked/ceed-blocked-operator.c +FNL:0,19,197 +FNA:0,192,CeedOperatorSetupFields_Blocked +FNL:1,202,266 +FNA:1,96,CeedOperatorSetup_Blocked +FNL:2,271,303 +FNA:2,96,CeedOperatorSetupInputs_Blocked +FNL:3,308,354 +FNA:3,192,CeedOperatorInputBasis_Blocked +FNL:4,359,400 +FNA:4,192,CeedOperatorOutputBasis_Blocked +FNL:5,405,427 +FNA:5,96,CeedOperatorRestoreInputs_Blocked +FNL:6,432,520 +FNA:6,96,CeedOperatorApplyAdd_Blocked +FNL:7,525,719 +FNA:7,0,CeedOperatorLinearAssembleQFunctionCore_Blocked +FNL:8,724,726 +FNA:8,0,CeedOperatorLinearAssembleQFunction_Blocked +FNL:9,731,733 +FNA:9,0,CeedOperatorLinearAssembleQFunctionUpdate_Blocked +FNL:10,738,775 +FNA:10,96,CeedOperatorDestroy_Blocked +FNL:11,780,793 +FNA:11,96,CeedOperatorCreate_Blocked +FNF:12 +FNH:9 +DA:19,192 +DA:31,192 +DA:32,192 +DA:33,192 +DA:34,192 +DA:36,192 +DA:37,96 +DA:38,96 +DA:40,96 +DA:41,96 +DA:45,480 +DA:49,288 +DA:50,288 +DA:57,240 +DA:58,240 +DA:59,240 +DA:60,240 +DA:61,240 +DA:62,240 +DA:63,240 +DA:65,240 +DA:66,240 +DA:67,144 +DA:68,144 +DA:70,144 +DA:71,144 +DA:73,144 +DA:74,144 +DA:75,0 +DA:76,0 +DA:77,0 +DA:79,0 +DA:80,0 +DA:81,0 +DA:83,0 +DA:84,0 +DA:85,0 +DA:86,0 +DA:87,0 +DA:88,0 +DA:90,0 +DA:91,0 +DA:92,0 +DA:95,0 +DA:96,0 +DA:97,0 +DA:98,96 +DA:101,96 +DA:102,96 +DA:104,96 +DA:105,0 +DA:107,0 +DA:109,240 +DA:110,240 +DA:111,240 +DA:114,288 +DA:115,96 +DA:116,96 +DA:117,96 +DA:118,96 +DA:119,96 +DA:120,144 +DA:124,144 +DA:125,144 +DA:126,144 +DA:127,144 +DA:128,144 +DA:129,144 +DA:130,144 +DA:131,144 +DA:132,144 +DA:133,144 +DA:134,48 +DA:135,48 +DA:136,48 +DA:137,48 +DA:138,48 +DA:139,48 +DA:140,48 +DA:144,192 +DA:145,288 +DA:149,192 +DA:150,192 +DA:151,288 +DA:155,96 +DA:156,96 +DA:157,96 +DA:158,0 +DA:159,0 +DA:160,0 +DA:162,96 +DA:163,96 +DA:165,192 
+DA:166,192 +DA:169,192 +DA:173,96 +DA:174,96 +DA:175,96 +DA:179,0 +DA:180,0 +DA:181,0 +DA:182,0 +DA:183,0 +DA:184,0 +DA:185,0 +DA:186,0 +DA:188,0 +DA:189,0 +DA:191,96 +DA:192,96 +DA:195,192 +DA:196,192 +DA:202,96 +DA:205,96 +DA:211,96 +DA:212,96 +DA:214,96 +DA:215,96 +DA:216,96 +DA:217,96 +DA:218,96 +DA:219,96 +DA:222,96 +DA:223,96 +DA:225,96 +DA:226,96 +DA:227,96 +DA:228,96 +DA:229,96 +DA:230,96 +DA:231,96 +DA:232,96 +DA:233,96 +DA:235,96 +DA:236,96 +DA:240,96 +DA:243,96 +DA:248,96 +DA:252,0 +DA:253,0 +DA:254,0 +DA:256,0 +DA:257,0 +DA:259,0 +DA:263,96 +DA:264,96 +DA:265,96 +DA:271,96 +DA:274,288 +DA:281,192 +DA:282,192 +DA:283,192 +DA:284,96 +DA:285,96 +DA:288,192 +DA:289,192 +DA:292,144 +DA:293,144 +DA:294,144 +DA:296,144 +DA:298,144 +DA:300,192 +DA:302,96 +DA:308,192 +DA:311,576 +DA:318,384 +DA:322,0 +DA:323,0 +DA:324,0 +DA:325,0 +DA:329,384 +DA:330,384 +DA:331,384 +DA:332,384 +DA:333,384 +DA:335,384 +DA:336,96 +DA:337,96 +DA:338,96 +DA:339,192 +DA:343,192 +DA:344,192 +DA:345,192 +DA:346,192 +DA:347,192 +DA:348,192 +DA:349,96 +DA:350,96 +DA:353,192 +DA:359,192 +DA:362,384 +DA:369,192 +DA:370,192 +DA:371,192 +DA:372,192 +DA:374,192 +DA:375,96 +DA:376,96 +DA:377,96 +DA:381,96 +DA:382,96 +DA:383,96 +DA:385,96 +DA:386,0 +DA:388,96 +DA:390,96 +DA:391,96 +DA:399,192 +DA:405,96 +DA:407,288 +DA:411,192 +DA:415,0 +DA:416,0 +DA:417,0 +DA:418,0 +DA:420,192 +DA:421,192 +DA:423,144 +DA:426,96 +DA:432,96 +DA:434,96 +DA:436,96 +DA:443,96 +DA:445,96 +DA:448,96 +DA:449,0 +DA:450,0 +DA:451,0 +DA:453,96 +DA:454,96 +DA:455,96 +DA:456,96 +DA:457,96 +DA:458,96 +DA:461,96 +DA:464,192 +DA:465,96 +DA:466,0 +DA:468,96 +DA:473,288 +DA:475,384 +DA:476,192 +DA:477,192 +DA:478,96 +DA:479,96 +DA:485,192 +DA:488,192 +DA:489,192 +DA:493,192 +DA:498,192 +DA:502,96 +DA:504,96 +DA:506,96 +DA:507,96 +DA:509,96 +DA:511,96 +DA:513,96 +DA:517,96 +DA:518,96 +DA:519,96 +DA:525,0 +DA:529,0 +DA:531,0 +DA:537,0 +DA:538,0 +DA:539,0 +DA:540,0 +DA:541,0 +DA:543,0 +DA:544,0 +DA:545,0 +DA:546,0 +DA:547,0 
+DA:548,0 +DA:549,0 +DA:552,0 +DA:555,0 +DA:558,0 +DA:561,0 +DA:562,0 +DA:567,0 +DA:568,0 +DA:569,0 +DA:570,0 +DA:571,0 +DA:573,0 +DA:575,0 +DA:576,0 +DA:580,0 +DA:581,0 +DA:586,0 +DA:587,0 +DA:588,0 +DA:589,0 +DA:591,0 +DA:593,0 +DA:594,0 +DA:598,0 +DA:599,0 +DA:601,0 +DA:602,0 +DA:604,0 +DA:607,0 +DA:608,0 +DA:610,0 +DA:612,0 +DA:616,0 +DA:617,0 +DA:618,0 +DA:621,0 +DA:624,0 +DA:628,0 +DA:630,0 +DA:633,0 +DA:639,0 +DA:640,0 +DA:641,0 +DA:642,0 +DA:643,0 +DA:644,0 +DA:649,0 +DA:650,0 +DA:651,0 +DA:654,0 +DA:656,0 +DA:661,0 +DA:663,0 +DA:664,0 +DA:665,0 +DA:666,0 +DA:668,0 +DA:671,0 +DA:677,0 +DA:678,0 +DA:679,0 +DA:680,0 +DA:681,0 +DA:687,0 +DA:688,0 +DA:689,0 +DA:696,0 +DA:697,0 +DA:701,0 +DA:702,0 +DA:703,0 +DA:705,0 +DA:710,0 +DA:713,0 +DA:714,0 +DA:715,0 +DA:716,0 +DA:717,0 +DA:718,0 +DA:724,0 +DA:725,0 +DA:731,0 +DA:732,0 +DA:738,96 +DA:741,96 +DA:743,96 +DA:744,96 +DA:745,96 +DA:746,96 +DA:747,384 +DA:748,288 +DA:749,288 +DA:751,96 +DA:752,96 +DA:753,96 +DA:755,288 +DA:756,192 +DA:757,192 +DA:759,96 +DA:760,96 +DA:762,192 +DA:763,96 +DA:764,96 +DA:766,96 +DA:767,96 +DA:770,96 +DA:771,96 +DA:773,96 +DA:774,96 +DA:780,96 +DA:784,96 +DA:785,96 +DA:786,96 +DA:787,96 +DA:788,96 +DA:789,96 +DA:790,96 +DA:791,96 +DA:792,96 +LF:393 +LH:244 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/blocked/ceed-blocked.c +FNL:0,18,32 +FNA:0,48,CeedInit_Blocked +FNL:1,37,37 +FNA:1,192,CeedRegister_Ref_Blocked +FNF:2 +FNH:2 +DA:18,48 +DA:21,48 +DA:23,48 +DA:26,48 +DA:27,48 +DA:28,48 +DA:30,48 +DA:31,48 +DA:37,192 +LF:9 +LH:9 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-avx.h +FNL:0,12,12 +FNA:0,192,CeedRegister_Avx_Blocked +FNL:1,13,13 +FNA:1,192,CeedRegister_Avx_Serial +FNF:2 +FNH:2 +DA:12,384 +DA:13,384 +LF:2 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-cuda.h +FNL:0,12,12 +FNA:0,192,CeedRegister_Cuda +FNL:1,13,13 +FNA:1,192,CeedRegister_Cuda_Gen +FNL:2,14,14 +FNA:2,192,CeedRegister_Cuda_Shared +FNF:3 
+FNH:3 +DA:12,384 +DA:13,384 +DA:14,384 +LF:3 +LH:3 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-hip.h +FNL:0,12,12 +FNA:0,192,CeedRegister_Hip +FNL:1,13,13 +FNA:1,192,CeedRegister_Hip_Gen +FNL:2,14,14 +FNA:2,192,CeedRegister_Hip_Shared +FNF:3 +FNH:3 +DA:12,384 +DA:13,384 +DA:14,384 +LF:3 +LH:3 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-magma.h +FNL:0,12,12 +FNA:0,192,CeedRegister_Magma +FNL:1,13,13 +FNA:1,192,CeedRegister_Magma_Det +FNF:2 +FNH:2 +DA:12,384 +DA:13,384 +LF:2 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-memcheck.h +FNL:0,12,12 +FNA:0,0,CeedRegister_Memcheck_Blocked +FNL:1,13,13 +FNA:1,0,CeedRegister_Memcheck_Serial +FNF:2 +FNH:0 +DA:12,192 +DA:13,192 +LF:2 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-ref.h +FNF:0 +FNH:0 +DA:12,192 +DA:13,192 +DA:14,192 +DA:15,192 +LF:4 +LH:4 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-sycl.h +FNL:0,12,12 +FNA:0,192,CeedRegister_Sycl +FNL:1,13,13 +FNA:1,192,CeedRegister_Sycl_Shared +FNL:2,14,14 +FNA:2,192,CeedRegister_Sycl_Gen +FNF:3 +FNH:3 +DA:12,384 +DA:13,384 +DA:14,384 +LF:3 +LH:3 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-xsmm.h +FNL:0,12,12 +FNA:0,0,CeedRegister_Xsmm_Blocked +FNL:1,13,13 +FNA:1,0,CeedRegister_Xsmm_Serial +FNF:2 +FNH:0 +DA:12,192 +DA:13,192 +LF:2 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-blocked.c +FNL:0,17,34 +FNA:0,24,CeedInit_Memcheck +FNL:1,39,39 +FNA:1,192,CeedRegister_Memcheck_Blocked +FNF:2 +FNH:2 +DA:17,24 +DA:20,24 +DA:23,24 +DA:24,24 +DA:25,24 +DA:27,24 +DA:28,24 +DA:29,24 +DA:30,24 +DA:31,24 +DA:32,24 +DA:33,24 +DA:39,192 +LF:13 +LH:13 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-qfunction.c +FNL:0,111,124 +FNA:0,96,CeedQFunctionCreate_Memcheck +FNL:1,19,93 +FNA:1,768,CeedQFunctionApply_Memcheck 
+FNL:2,98,106 +FNA:2,96,CeedQFunctionDestroy_Memcheck +FNF:3 +FNH:3 +DA:19,768 +DA:20,768 +DA:23,768 +DA:27,768 +DA:28,768 +DA:29,768 +DA:30,768 +DA:33,2304 +DA:35,1536 +DA:37,1536 +DA:39,1536 +DA:41,1536 +DA:42,1536 +DA:46,1536 +DA:48,768 +DA:50,768 +DA:52,768 +DA:53,768 +DA:55,768 +DA:56,768 +DA:60,768 +DA:63,2304 +DA:64,1536 +DA:65,1536 +DA:72,768 +DA:73,768 +DA:74,768 +DA:75,1536 +DA:80,768 +DA:81,768 +DA:82,50688 +DA:83,49920 +DA:87,768 +DA:88,768 +DA:91,768 +DA:92,768 +DA:98,96 +DA:101,96 +DA:102,96 +DA:103,96 +DA:104,96 +DA:105,96 +DA:111,96 +DA:115,96 +DA:116,96 +DA:117,96 +DA:118,96 +DA:119,96 +DA:120,96 +DA:121,96 +DA:122,96 +DA:123,96 +LF:52 +LH:52 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-qfunctioncontext.c +FNL:0,116,140 +FNA:0,0,CeedQFunctionContextTakeData_Memcheck +FNL:1,145,160 +FNA:1,396,CeedQFunctionContextGetData_Memcheck +FNL:2,165,182 +FNA:2,0,CeedQFunctionContextGetDataRead_Memcheck +FNL:3,187,203 +FNA:3,396,CeedQFunctionContextRestoreData_Memcheck +FNL:4,19,25 +FNA:4,396,CeedQFunctionContextHasValidData_Memcheck +FNL:5,208,225 +FNA:5,0,CeedQFunctionContextRestoreDataRead_Memcheck +FNL:6,230,262 +FNA:6,144,CeedQFunctionContextDataDestroy_Memcheck +FNL:7,267,285 +FNA:7,72,CeedQFunctionContextDestroy_Memcheck +FNL:8,290,309 +FNA:8,72,CeedQFunctionContextCreate_Memcheck +FNL:9,30,38 +FNA:9,0,CeedQFunctionContextHasBorrowedDataOfType_Memcheck +FNL:10,43,89 +FNA:10,72,CeedQFunctionContextSetData_Memcheck +FNL:11,94,111 +FNA:11,396,CeedQFunctionContextSyncData_Memcheck +FNF:12 +FNH:8 +DA:19,396 +DA:22,396 +DA:23,396 +DA:24,396 +DA:30,0 +DA:33,0 +DA:35,0 +DA:36,0 +DA:37,0 +DA:43,72 +DA:47,72 +DA:49,72 +DA:50,72 +DA:53,72 +DA:54,0 +DA:55,0 +DA:57,72 +DA:58,72 +DA:59,0 +DA:60,0 +DA:62,72 +DA:65,72 +DA:68,72 +DA:69,0 +DA:70,0 +DA:71,0 +DA:72,0 +DA:73,24 +DA:74,24 +DA:75,24 +DA:76,24 +DA:77,24 +DA:78,48 +DA:79,48 +DA:80,48 +DA:81,48 +DA:85,72 +DA:86,72 +DA:87,72 +DA:88,72 +DA:94,396 +DA:98,396 +DA:100,396 
+DA:101,396 +DA:104,396 +DA:105,204 +DA:107,396 +DA:108,192 +DA:110,396 +DA:116,0 +DA:120,0 +DA:122,0 +DA:123,0 +DA:126,0 +DA:129,0 +DA:130,0 +DA:131,0 +DA:134,0 +DA:135,0 +DA:136,0 +DA:138,0 +DA:139,0 +DA:145,396 +DA:149,396 +DA:151,396 +DA:152,396 +DA:155,396 +DA:156,396 +DA:157,396 +DA:158,396 +DA:159,396 +DA:165,0 +DA:169,0 +DA:171,0 +DA:172,0 +DA:175,0 +DA:176,0 +DA:177,0 +DA:178,0 +DA:180,0 +DA:181,0 +DA:187,396 +DA:191,396 +DA:192,396 +DA:195,396 +DA:196,396 +DA:199,396 +DA:200,396 +DA:201,396 +DA:202,396 +DA:208,0 +DA:212,0 +DA:213,0 +DA:216,0 +DA:218,0 +DA:221,0 +DA:222,0 +DA:223,0 +DA:224,0 +DA:230,144 +DA:235,144 +DA:237,144 +DA:238,144 +DA:242,144 +DA:243,0 +DA:245,0 +DA:246,0 +DA:247,0 +DA:250,144 +DA:251,72 +DA:252,72 +DA:254,144 +DA:255,24 +DA:256,24 +DA:258,144 +DA:259,48 +DA:261,144 +DA:267,72 +DA:271,72 +DA:272,72 +DA:273,0 +DA:274,0 +DA:276,72 +DA:277,0 +DA:278,0 +DA:280,72 +DA:281,48 +DA:283,72 +DA:284,72 +DA:290,72 +DA:294,72 +DA:295,72 +DA:296,72 +DA:297,72 +DA:298,72 +DA:299,72 +DA:300,72 +DA:301,72 +DA:302,72 +DA:303,72 +DA:304,72 +DA:305,72 +DA:306,72 +DA:307,72 +DA:308,72 +LF:145 +LH:92 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-restriction.c +FNL:0,109,149 +FNA:0,0,CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core +FNL:1,151,190 +FNA:1,0,CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core +FNL:2,19,41 +FNA:2,192,CeedElemRestrictionGetBackendStrides_Memcheck +FNL:3,192,216 +FNA:3,48,CeedElemRestrictionApplyStridedTranspose_Memcheck_Core +FNL:4,218,240 +FNA:4,48,CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core +FNL:5,242,264 +FNA:5,0,CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core +FNL:6,266,315 +FNA:6,0,CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core +FNL:7,317,365 +FNA:7,0,CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core +FNL:8,367,390 +FNA:8,0,CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core +FNL:9,392,500 
+FNA:9,240,CeedElemRestrictionApply_Memcheck_Core +FNL:10,46,70 +FNA:10,48,CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core +FNL:11,505,516 +FNA:11,240,CeedElemRestrictionApply_Memcheck +FNL:12,521,533 +FNA:12,0,CeedElemRestrictionApplyUnsigned_Memcheck +FNL:13,538,550 +FNA:13,0,CeedElemRestrictionApplyUnoriented_Memcheck +FNL:14,555,563 +FNA:14,0,CeedElemRestrictionApplyAtPointsInElement_Memcheck +FNL:15,568,579 +FNA:15,0,CeedElemRestrictionApplyBlock_Memcheck +FNL:16,584,593 +FNA:16,72,CeedElemRestrictionGetOffsets_Memcheck +FNL:17,598,607 +FNA:17,0,CeedElemRestrictionGetOrientations_Memcheck +FNL:18,612,621 +FNA:18,0,CeedElemRestrictionGetCurlOrientations_Memcheck +FNL:19,626,635 +FNA:19,192,CeedElemRestrictionDestroy_Memcheck +FNL:20,640,773 +FNA:20,264,CeedElemRestrictionCreate_Memcheck +FNL:21,72,88 +FNA:21,96,CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core +FNL:22,90,107 +FNA:22,0,CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core +FNF:23 +FNH:10 +DA:19,192 +DA:22,192 +DA:23,192 +DA:24,192 +DA:26,192 +DA:27,192 +DA:28,192 +DA:40,192 +DA:46,48 +DA:52,48 +DA:54,48 +DA:55,48 +DA:56,0 +DA:59,432 +DA:60,768 +DA:61,9888 +DA:62,34464 +DA:63,24960 +DA:64,24960 +DA:69,48 +DA:72,96 +DA:79,96 +DA:80,864 +DA:81,1656 +DA:82,55608 +DA:83,54720 +DA:87,96 +DA:90,0 +DA:97,0 +DA:98,0 +DA:99,0 +DA:100,0 +DA:101,0 +DA:102,0 +DA:106,0 +DA:109,0 +DA:116,0 +DA:117,0 +DA:118,0 +DA:119,0 +DA:121,0 +DA:122,0 +DA:123,0 +DA:124,0 +DA:125,0 +DA:126,0 +DA:128,0 +DA:129,0 +DA:130,0 +DA:131,0 +DA:132,0 +DA:133,0 +DA:134,0 +DA:135,0 +DA:136,0 +DA:139,0 +DA:140,0 +DA:141,0 +DA:142,0 +DA:143,0 +DA:144,0 +DA:148,0 +DA:151,0 +DA:157,0 +DA:158,0 +DA:159,0 +DA:160,0 +DA:162,0 +DA:163,0 +DA:164,0 +DA:165,0 +DA:166,0 +DA:167,0 +DA:169,0 +DA:170,0 +DA:171,0 +DA:172,0 +DA:173,0 +DA:174,0 +DA:175,0 +DA:176,0 +DA:177,0 +DA:180,0 +DA:181,0 +DA:182,0 +DA:183,0 +DA:184,0 +DA:185,0 +DA:189,0 +DA:192,48 +DA:198,48 +DA:200,48 +DA:201,48 +DA:202,0 +DA:205,432 +DA:206,768 +DA:207,9888 
+DA:208,24096 +DA:209,14592 +DA:210,14592 +DA:215,48 +DA:218,48 +DA:225,48 +DA:226,432 +DA:227,768 +DA:228,6624 +DA:230,16000 +DA:233,9760 +DA:234,9760 +DA:239,48 +DA:242,0 +DA:249,0 +DA:250,0 +DA:251,0 +DA:252,0 +DA:254,0 +DA:257,0 +DA:258,0 +DA:263,0 +DA:266,0 +DA:269,0 +DA:272,0 +DA:274,0 +DA:275,0 +DA:276,0 +DA:278,0 +DA:279,0 +DA:281,0 +DA:282,0 +DA:283,0 +DA:284,0 +DA:285,0 +DA:287,0 +DA:288,0 +DA:290,0 +DA:291,0 +DA:292,0 +DA:293,0 +DA:294,0 +DA:295,0 +DA:296,0 +DA:297,0 +DA:299,0 +DA:300,0 +DA:303,0 +DA:304,0 +DA:305,0 +DA:306,0 +DA:307,0 +DA:309,0 +DA:310,0 +DA:314,0 +DA:317,0 +DA:319,0 +DA:322,0 +DA:324,0 +DA:325,0 +DA:326,0 +DA:328,0 +DA:329,0 +DA:331,0 +DA:332,0 +DA:333,0 +DA:334,0 +DA:335,0 +DA:337,0 +DA:338,0 +DA:340,0 +DA:341,0 +DA:342,0 +DA:343,0 +DA:344,0 +DA:345,0 +DA:346,0 +DA:347,0 +DA:349,0 +DA:350,0 +DA:353,0 +DA:354,0 +DA:355,0 +DA:356,0 +DA:357,0 +DA:359,0 +DA:360,0 +DA:364,0 +DA:367,0 +DA:371,0 +DA:374,0 +DA:375,0 +DA:376,0 +DA:377,0 +DA:378,0 +DA:379,0 +DA:380,0 +DA:383,0 +DA:384,0 +DA:387,0 +DA:389,0 +DA:392,240 +DA:401,240 +DA:402,240 +DA:403,240 +DA:404,240 +DA:405,240 +DA:407,240 +DA:409,96 +DA:412,144 +DA:415,240 +DA:421,96 +DA:422,48 +DA:423,48 +DA:425,48 +DA:426,48 +DA:427,48 +DA:429,48 +DA:430,0 +DA:431,0 +DA:432,0 +DA:435,0 +DA:438,0 +DA:439,0 +DA:440,0 +DA:441,0 +DA:443,0 +DA:444,0 +DA:447,0 +DA:450,0 +DA:451,0 +DA:452,0 +DA:453,0 +DA:461,144 +DA:462,48 +DA:463,48 +DA:465,48 +DA:466,96 +DA:467,96 +DA:469,96 +DA:470,0 +DA:471,0 +DA:472,0 +DA:475,0 +DA:478,0 +DA:479,0 +DA:480,0 +DA:481,0 +DA:483,0 +DA:484,0 +DA:487,0 +DA:490,0 +DA:491,0 +DA:492,0 +DA:493,0 +DA:496,240 +DA:497,240 +DA:498,240 +DA:499,240 +DA:505,240 +DA:509,240 +DA:510,240 +DA:511,240 +DA:512,240 +DA:513,240 +DA:514,240 +DA:515,240 +DA:521,0 +DA:526,0 +DA:527,0 +DA:528,0 +DA:529,0 +DA:530,0 +DA:531,0 +DA:532,0 +DA:538,0 +DA:543,0 +DA:544,0 +DA:545,0 +DA:546,0 +DA:547,0 +DA:548,0 +DA:549,0 +DA:555,0 +DA:560,0 +DA:561,0 +DA:562,0 +DA:568,0 +DA:573,0 +DA:574,0 
+DA:575,0 +DA:576,0 +DA:577,0 +DA:578,0 +DA:584,72 +DA:587,72 +DA:589,72 +DA:591,72 +DA:592,72 +DA:598,0 +DA:601,0 +DA:603,0 +DA:605,0 +DA:606,0 +DA:612,0 +DA:615,0 +DA:617,0 +DA:619,0 +DA:620,0 +DA:626,192 +DA:629,192 +DA:630,192 +DA:631,192 +DA:632,192 +DA:633,192 +DA:634,192 +DA:640,264 +DA:643,264 +DA:647,264 +DA:648,264 +DA:649,264 +DA:650,264 +DA:651,264 +DA:652,264 +DA:653,264 +DA:654,264 +DA:656,264 +DA:658,264 +DA:659,264 +DA:664,264 +DA:666,264 +DA:667,264 +DA:668,96 +DA:669,96 +DA:670,96 +DA:671,96 +DA:677,264 +DA:678,0 +DA:680,0 +DA:681,0 +DA:683,0 +DA:684,0 +DA:687,0 +DA:688,0 +DA:692,264 +DA:697,168 +DA:698,34328 +DA:699,34160 +DA:705,168 +DA:706,168 +DA:707,168 +DA:708,96 +DA:709,96 +DA:710,96 +DA:711,96 +DA:712,96 +DA:713,72 +DA:714,72 +DA:715,72 +DA:716,72 +DA:717,0 +DA:718,0 +DA:722,168 +DA:723,0 +DA:724,0 +DA:725,0 +DA:726,0 +DA:727,0 +DA:728,0 +DA:729,0 +DA:730,0 +DA:731,0 +DA:732,0 +DA:733,0 +DA:734,0 +DA:735,0 +DA:737,168 +DA:738,0 +DA:739,0 +DA:740,0 +DA:741,0 +DA:742,0 +DA:743,0 +DA:744,0 +DA:745,0 +DA:746,0 +DA:747,0 +DA:748,0 +DA:749,0 +DA:750,0 +DA:756,264 +DA:759,264 +DA:760,264 +DA:761,264 +DA:762,264 +DA:763,0 +DA:766,264 +DA:767,264 +DA:768,264 +DA:769,264 +DA:770,264 +DA:771,264 +DA:772,264 +LF:400 +LH:145 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-serial.c +FNL:0,17,35 +FNA:0,24,CeedInit_Memcheck +FNL:1,40,40 +FNA:1,192,CeedRegister_Memcheck_Serial +FNF:2 +FNH:2 +DA:17,24 +DA:20,24 +DA:24,24 +DA:25,24 +DA:26,24 +DA:28,24 +DA:29,24 +DA:30,24 +DA:31,24 +DA:32,24 +DA:33,24 +DA:34,24 +DA:40,192 +LF:13 +LH:13 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-vector.c +FNL:0,102,113 +FNA:0,144,CeedVectorSetValue_Memcheck +FNL:1,118,130 +FNA:1,0,CeedVectorSetValueStrided_Memcheck +FNL:2,135,152 +FNA:2,2352,CeedVectorSyncArray_Memcheck +FNL:3,157,181 +FNA:3,0,CeedVectorTakeArray_Memcheck +FNL:4,186,201 +FNA:4,2352,CeedVectorGetArray_Memcheck +FNL:5,206,223 
+FNA:5,3120,CeedVectorGetArrayRead_Memcheck +FNL:6,21,27 +FNA:6,3240,CeedVectorHasValidArray_Memcheck +FNL:7,228,247 +FNA:7,2232,CeedVectorGetArrayWrite_Memcheck +FNL:8,252,279 +FNA:8,2352,CeedVectorRestoreArray_Memcheck +FNL:9,284,301 +FNA:9,3120,CeedVectorRestoreArrayRead_Memcheck +FNL:10,306,317 +FNA:10,0,CeedVectorReciprocal_Memcheck +FNL:11,32,40 +FNA:11,0,CeedVectorHasBorrowedArrayOfType_Memcheck +FNL:12,322,331 +FNA:12,0,CeedVectorScale_Memcheck +FNL:13,336,346 +FNA:13,0,CeedVectorAXPY_Memcheck +FNL:14,351,361 +FNA:14,0,CeedVectorAXPBY_Memcheck +FNL:15,366,379 +FNA:15,0,CeedVectorPointwiseMult_Memcheck +FNL:16,384,402 +FNA:16,864,CeedVectorDestroy_Memcheck +FNL:17,407,434 +FNA:17,864,CeedVectorCreate_Memcheck +FNL:18,45,97 +FNA:18,2544,CeedVectorSetArray_Memcheck +FNF:19 +FNH:11 +DA:21,3240 +DA:24,3240 +DA:25,3240 +DA:26,3240 +DA:32,0 +DA:35,0 +DA:37,0 +DA:38,0 +DA:39,0 +DA:45,2544 +DA:49,2544 +DA:51,2544 +DA:52,2544 +DA:55,2544 +DA:56,31648 +DA:57,1680 +DA:59,2544 +DA:60,2544 +DA:61,1968 +DA:62,0 +DA:63,0 +DA:65,1968 +DA:69,2544 +DA:72,2544 +DA:73,576 +DA:75,576 +DA:76,0 +DA:77,0 +DA:78,0 +DA:79,0 +DA:80,0 +DA:81,1968 +DA:82,1968 +DA:83,1968 +DA:84,1968 +DA:85,1968 +DA:89,2544 +DA:90,2544 +DA:91,2544 +DA:92,1992 +DA:94,359032 +DA:96,2544 +DA:102,144 +DA:106,144 +DA:107,144 +DA:109,144 +DA:110,144 +DA:111,30960 +DA:112,144 +DA:118,0 +DA:122,0 +DA:123,0 +DA:125,0 +DA:126,0 +DA:127,0 +DA:128,0 +DA:129,0 +DA:135,2352 +DA:139,2352 +DA:141,2352 +DA:142,2352 +DA:145,2352 +DA:146,0 +DA:148,2352 +DA:149,792 +DA:151,2352 +DA:157,0 +DA:161,0 +DA:163,0 +DA:164,0 +DA:167,0 +DA:170,0 +DA:171,0 +DA:172,0 +DA:175,0 +DA:176,0 +DA:177,0 +DA:179,0 +DA:180,0 +DA:186,2352 +DA:190,2352 +DA:192,2352 +DA:193,2352 +DA:196,2352 +DA:197,2352 +DA:198,2352 +DA:199,2352 +DA:200,2352 +DA:206,3120 +DA:210,3120 +DA:212,3120 +DA:213,3120 +DA:216,3120 +DA:217,3120 +DA:218,3120 +DA:219,3120 +DA:221,3120 +DA:222,3120 +DA:228,2232 +DA:232,2232 +DA:234,2232 +DA:235,2232 +DA:238,2232 +DA:241,2232 
+DA:244,417752 +DA:245,2232 +DA:246,2232 +DA:252,2352 +DA:256,2352 +DA:257,2352 +DA:260,2352 +DA:261,417752 +DA:262,415520 +DA:263,0 +DA:267,2232 +DA:271,2352 +DA:272,2352 +DA:275,449456 +DA:276,2352 +DA:277,2352 +DA:278,2352 +DA:284,3120 +DA:288,3120 +DA:289,3120 +DA:292,3120 +DA:294,3120 +DA:297,503680 +DA:298,3120 +DA:299,3120 +DA:300,3120 +DA:306,0 +DA:310,0 +DA:311,0 +DA:313,0 +DA:314,0 +DA:316,0 +DA:322,0 +DA:326,0 +DA:327,0 +DA:329,0 +DA:330,0 +DA:336,0 +DA:340,0 +DA:341,0 +DA:342,0 +DA:344,0 +DA:345,0 +DA:351,0 +DA:355,0 +DA:356,0 +DA:357,0 +DA:359,0 +DA:360,0 +DA:366,0 +DA:370,0 +DA:371,0 +DA:372,0 +DA:373,0 +DA:375,0 +DA:376,0 +DA:377,0 +DA:378,0 +DA:384,864 +DA:388,864 +DA:389,864 +DA:390,864 +DA:391,864 +DA:393,864 +DA:394,0 +DA:395,0 +DA:397,864 +DA:398,288 +DA:400,864 +DA:401,864 +DA:407,864 +DA:411,864 +DA:412,864 +DA:413,864 +DA:414,864 +DA:415,864 +DA:416,864 +DA:417,864 +DA:418,864 +DA:419,864 +DA:420,864 +DA:421,864 +DA:422,864 +DA:423,864 +DA:424,864 +DA:425,864 +DA:426,864 +DA:427,864 +DA:428,864 +DA:429,864 +DA:430,864 +DA:431,864 +DA:432,864 +DA:433,864 +LF:198 +LH:129 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-blocked.c +FNL:0,18,24 +FNA:0,48,CeedDestroy_Opt +FNL:1,29,51 +FNA:1,48,CeedInit_Opt_Blocked +FNL:2,56,56 +FNA:2,192,CeedRegister_Opt_Blocked +FNF:3 +FNH:3 +DA:18,48 +DA:21,48 +DA:22,48 +DA:23,48 +DA:29,48 +DA:33,48 +DA:35,48 +DA:38,48 +DA:39,48 +DA:40,48 +DA:42,48 +DA:43,48 +DA:44,48 +DA:47,48 +DA:48,48 +DA:49,48 +DA:50,48 +DA:56,192 +LF:18 +LH:18 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-operator.c +FNL:0,19,200 +FNA:0,384,CeedOperatorSetupFields_Opt +FNL:1,205,272 +FNA:1,192,CeedOperatorSetup_Opt +FNL:2,277,312 +FNA:2,192,CeedOperatorSetupInputs_Opt +FNL:3,317,368 +FNA:3,1536,CeedOperatorInputBasis_Opt +FNL:4,373,418 +FNA:4,1536,CeedOperatorOutputBasis_Opt +FNL:5,423,437 +FNA:5,192,CeedOperatorRestoreInputs_Opt +FNL:6,442,513 +FNA:6,192,CeedOperatorApplyAdd_Opt +FNL:7,518,726 
+FNA:7,0,CeedOperatorLinearAssembleQFunctionCore_Opt +FNL:8,731,733 +FNA:8,0,CeedOperatorLinearAssembleQFunction_Opt +FNL:9,738,740 +FNA:9,0,CeedOperatorLinearAssembleQFunctionUpdate_Opt +FNL:10,745,780 +FNA:10,192,CeedOperatorDestroy_Opt +FNL:11,785,805 +FNA:11,192,CeedOperatorCreate_Opt +FNF:12 +FNH:9 +DA:19,384 +DA:31,384 +DA:32,384 +DA:33,384 +DA:34,384 +DA:36,384 +DA:37,192 +DA:38,192 +DA:40,192 +DA:41,192 +DA:45,960 +DA:49,576 +DA:50,576 +DA:57,480 +DA:58,480 +DA:59,480 +DA:60,480 +DA:61,480 +DA:62,480 +DA:63,480 +DA:65,480 +DA:66,480 +DA:67,288 +DA:68,288 +DA:70,288 +DA:71,288 +DA:73,288 +DA:74,288 +DA:75,0 +DA:76,0 +DA:77,0 +DA:79,0 +DA:80,0 +DA:81,0 +DA:83,0 +DA:84,0 +DA:85,0 +DA:86,0 +DA:87,0 +DA:88,0 +DA:90,0 +DA:91,0 +DA:92,0 +DA:95,0 +DA:96,0 +DA:97,0 +DA:98,192 +DA:101,192 +DA:102,192 +DA:104,192 +DA:105,0 +DA:107,0 +DA:109,480 +DA:110,480 +DA:111,480 +DA:114,576 +DA:115,192 +DA:116,192 +DA:117,192 +DA:118,192 +DA:119,192 +DA:120,192 +DA:121,192 +DA:122,288 +DA:126,288 +DA:127,288 +DA:128,288 +DA:129,288 +DA:130,288 +DA:131,288 +DA:132,288 +DA:133,288 +DA:134,288 +DA:135,288 +DA:136,96 +DA:137,96 +DA:138,96 +DA:139,96 +DA:140,96 +DA:141,96 +DA:142,96 +DA:145,576 +DA:148,384 +DA:149,576 +DA:153,384 +DA:154,384 +DA:155,576 +DA:159,192 +DA:160,192 +DA:161,192 +DA:162,0 +DA:163,0 +DA:164,0 +DA:166,192 +DA:167,192 +DA:169,384 +DA:170,384 +DA:173,384 +DA:177,192 +DA:178,192 +DA:179,192 +DA:183,0 +DA:184,0 +DA:185,0 +DA:186,0 +DA:187,0 +DA:188,0 +DA:189,0 +DA:191,0 +DA:192,0 +DA:194,192 +DA:195,192 +DA:198,384 +DA:199,384 +DA:205,192 +DA:215,192 +DA:216,192 +DA:218,192 +DA:219,192 +DA:220,192 +DA:221,192 +DA:222,192 +DA:223,192 +DA:224,192 +DA:225,192 +DA:226,192 +DA:227,192 +DA:230,192 +DA:231,192 +DA:233,192 +DA:234,192 +DA:235,192 +DA:236,192 +DA:237,192 +DA:238,192 +DA:239,192 +DA:240,192 +DA:242,192 +DA:243,192 +DA:247,192 +DA:250,192 +DA:254,192 +DA:258,0 +DA:259,0 +DA:260,0 +DA:262,0 +DA:263,0 +DA:265,0 +DA:269,192 +DA:270,192 +DA:271,192 +DA:277,192 
+DA:280,576 +DA:283,384 +DA:284,384 +DA:290,288 +DA:291,288 +DA:293,96 +DA:294,96 +DA:295,96 +DA:297,96 +DA:299,96 +DA:302,192 +DA:303,0 +DA:304,0 +DA:305,0 +DA:308,288 +DA:311,192 +DA:317,1536 +DA:320,4608 +DA:329,3072 +DA:330,3072 +DA:331,3072 +DA:332,3072 +DA:335,3072 +DA:336,3072 +DA:337,3072 +DA:338,3072 +DA:339,3072 +DA:341,3072 +DA:342,1536 +DA:345,3072 +DA:346,768 +DA:347,768 +DA:348,768 +DA:350,768 +DA:351,1536 +DA:355,1536 +DA:356,1536 +DA:357,0 +DA:358,0 +DA:360,1536 +DA:361,1536 +DA:362,1536 +DA:363,768 +DA:364,768 +DA:367,1536 +DA:373,1536 +DA:376,3072 +DA:383,1536 +DA:385,1536 +DA:386,768 +DA:387,768 +DA:388,768 +DA:392,768 +DA:393,768 +DA:394,0 +DA:396,768 +DA:398,768 +DA:399,768 +DA:407,1536 +DA:409,1536 +DA:410,1536 +DA:411,1536 +DA:413,1536 +DA:415,1536 +DA:417,1536 +DA:423,192 +DA:425,576 +DA:429,384 +DA:430,384 +DA:431,384 +DA:432,96 +DA:434,384 +DA:436,192 +DA:442,192 +DA:447,192 +DA:454,192 +DA:456,192 +DA:457,192 +DA:458,192 +DA:459,192 +DA:460,192 +DA:461,192 +DA:462,192 +DA:465,192 +DA:466,0 +DA:467,0 +DA:468,0 +DA:470,0 +DA:473,192 +DA:474,192 +DA:475,192 +DA:476,192 +DA:479,192 +DA:482,384 +DA:484,192 +DA:485,192 +DA:487,96 +DA:488,96 +DA:489,96 +DA:494,1728 +DA:496,1536 +DA:500,1536 +DA:501,1536 +DA:505,1536 +DA:510,192 +DA:511,192 +DA:512,192 +DA:518,0 +DA:523,0 +DA:529,0 +DA:530,0 +DA:531,0 +DA:532,0 +DA:533,0 +DA:535,0 +DA:536,0 +DA:537,0 +DA:538,0 +DA:539,0 +DA:540,0 +DA:541,0 +DA:542,0 +DA:543,0 +DA:546,0 +DA:549,0 +DA:552,0 +DA:555,0 +DA:556,0 +DA:561,0 +DA:562,0 +DA:563,0 +DA:564,0 +DA:565,0 +DA:567,0 +DA:569,0 +DA:570,0 +DA:574,0 +DA:575,0 +DA:580,0 +DA:581,0 +DA:582,0 +DA:583,0 +DA:585,0 +DA:587,0 +DA:588,0 +DA:592,0 +DA:593,0 +DA:595,0 +DA:596,0 +DA:597,0 +DA:601,0 +DA:602,0 +DA:604,0 +DA:606,0 +DA:610,0 +DA:611,0 +DA:612,0 +DA:615,0 +DA:618,0 +DA:622,0 +DA:623,0 +DA:624,0 +DA:627,0 +DA:631,0 +DA:637,0 +DA:638,0 +DA:639,0 +DA:640,0 +DA:641,0 +DA:642,0 +DA:647,0 +DA:648,0 +DA:649,0 +DA:652,0 +DA:654,0 +DA:658,0 +DA:659,0 
+DA:662,0 +DA:663,0 +DA:664,0 +DA:666,0 +DA:669,0 +DA:675,0 +DA:676,0 +DA:677,0 +DA:678,0 +DA:679,0 +DA:685,0 +DA:686,0 +DA:687,0 +DA:693,0 +DA:694,0 +DA:698,0 +DA:699,0 +DA:700,0 +DA:702,0 +DA:707,0 +DA:708,0 +DA:712,0 +DA:716,0 +DA:717,0 +DA:718,0 +DA:722,0 +DA:723,0 +DA:724,0 +DA:725,0 +DA:731,0 +DA:732,0 +DA:738,0 +DA:739,0 +DA:745,192 +DA:748,192 +DA:749,768 +DA:750,576 +DA:751,576 +DA:753,192 +DA:754,192 +DA:755,192 +DA:756,192 +DA:757,192 +DA:758,192 +DA:760,576 +DA:761,384 +DA:762,384 +DA:764,192 +DA:765,192 +DA:767,384 +DA:768,192 +DA:769,192 +DA:771,192 +DA:772,192 +DA:775,192 +DA:776,192 +DA:778,192 +DA:779,192 +DA:785,192 +DA:790,192 +DA:791,192 +DA:792,192 +DA:794,192 +DA:795,192 +DA:797,192 +DA:799,192 +DA:800,192 +DA:801,192 +DA:802,192 +DA:803,192 +DA:804,192 +LF:400 +LH:249 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-serial.c +FNL:0,18,24 +FNA:0,48,CeedDestroy_Opt +FNL:1,29,51 +FNA:1,48,CeedInit_Opt_Serial +FNL:2,56,56 +FNA:2,192,CeedRegister_Opt_Serial +FNF:3 +FNH:3 +DA:18,48 +DA:21,48 +DA:22,48 +DA:23,48 +DA:29,48 +DA:33,48 +DA:35,48 +DA:38,48 +DA:39,48 +DA:40,48 +DA:42,48 +DA:43,48 +DA:44,48 +DA:47,48 +DA:48,48 +DA:49,48 +DA:50,48 +DA:56,192 +LF:18 +LH:18 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-tensor.c +FNL:0,16,35 +FNA:0,2016,CeedTensorContractApply_Core_Opt +FNL:1,40,49 +FNA:1,2016,CeedTensorContractApply_Opt +FNL:2,54,57 +FNA:2,96,CeedTensorContractCreate_Opt +FNF:3 +FNH:3 +DA:16,2016 +DA:19,2016 +DA:21,2016 +DA:22,504 +DA:23,504 +DA:26,13080 +DA:27,72600 +DA:28,419808 +DA:29,358272 +DA:30,2727456 +DA:34,2016 +DA:40,2016 +DA:42,2016 +DA:43,415104 +DA:46,2016 +DA:47,672 +DA:54,96 +DA:55,96 +DA:56,96 +LF:19 +LH:19 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-basis.c +FNL:0,19,251 +FNA:0,4800,CeedBasisApplyCore_Ref +FNL:1,253,256 +FNA:1,4800,CeedBasisApply_Ref +FNL:2,258,261 +FNA:2,0,CeedBasisApplyAdd_Ref +FNL:3,266,273 +FNA:3,384,CeedBasisDestroyTensor_Ref 
+FNL:4,278,306 +FNA:4,384,CeedBasisCreateTensorH1_Ref +FNL:5,311,328 +FNA:5,0,CeedBasisCreateH1_Ref +FNL:6,333,350 +FNA:6,0,CeedBasisCreateHdiv_Ref +FNL:7,355,372 +FNA:7,0,CeedBasisCreateHcurl_Ref +FNF:8 +FNH:4 +DA:19,4800 +DA:21,4800 +DA:28,4800 +DA:29,4800 +DA:30,4800 +DA:31,4800 +DA:32,4800 +DA:33,4800 +DA:34,4800 +DA:35,4800 +DA:36,192 +DA:38,4800 +DA:39,4800 +DA:41,4800 +DA:44,1536 +DA:45,64576 +DA:48,4800 +DA:49,4800 +DA:53,4800 +DA:54,4800 +DA:55,4800 +DA:57,3072 +DA:58,3072 +DA:59,0 +DA:60,3072 +DA:61,3072 +DA:63,3072 +DA:64,1536 +DA:65,1536 +DA:67,3072 +DA:68,3072 +DA:71,3072 +DA:72,7104 +DA:73,4032 +DA:75,4032 +DA:76,4032 +DA:79,3072 +DA:81,1536 +DA:86,1536 +DA:88,1536 +DA:89,0 +DA:90,0 +DA:92,1536 +DA:95,1536 +DA:96,1536 +DA:97,1536 +DA:98,1536 +DA:102,3552 +DA:103,2016 +DA:107,2016 +DA:108,2016 +DA:112,1536 +DA:113,1536 +DA:114,0 +DA:115,0 +DA:117,1536 +DA:118,3552 +DA:119,2016 +DA:124,2016 +DA:125,2016 +DA:127,0 +DA:130,0 +DA:133,0 +DA:135,0 +DA:136,0 +DA:139,0 +DA:140,0 +DA:142,0 +DA:145,0 +DA:147,0 +DA:148,0 +DA:149,0 +DA:151,0 +DA:154,0 +DA:155,0 +DA:157,0 +DA:158,0 +DA:162,0 +DA:163,0 +DA:167,1536 +DA:169,192 +DA:170,192 +DA:173,192 +DA:174,192 +DA:175,576 +DA:176,384 +DA:178,3648 +DA:179,22848 +DA:180,66048 +DA:181,46464 +DA:183,255552 +DA:188,192 +DA:199,0 +DA:201,0 +DA:203,0 +DA:206,0 +DA:207,0 +DA:208,0 +DA:210,0 +DA:213,0 +DA:214,0 +DA:215,0 +DA:217,0 +DA:220,0 +DA:221,0 +DA:222,0 +DA:224,0 +DA:227,0 +DA:228,0 +DA:229,0 +DA:231,0 +DA:234,0 +DA:235,0 +DA:236,0 +DA:237,0 +DA:239,0 +DA:246,4800 +DA:247,4608 +DA:249,4800 +DA:250,4800 +DA:253,4800 +DA:254,4800 +DA:255,4800 +DA:258,0 +DA:259,0 +DA:260,0 +DA:266,384 +DA:269,384 +DA:270,384 +DA:271,384 +DA:272,384 +DA:278,384 +DA:284,384 +DA:285,384 +DA:287,384 +DA:289,384 +DA:290,384 +DA:291,384 +DA:292,384 +DA:294,384 +DA:296,384 +DA:297,384 +DA:298,384 +DA:300,384 +DA:301,384 +DA:302,384 +DA:303,384 +DA:304,384 +DA:305,384 +DA:311,0 +DA:316,0 +DA:317,0 +DA:319,0 +DA:320,0 +DA:321,0 +DA:323,0 
+DA:324,0 +DA:325,0 +DA:326,0 +DA:327,0 +DA:333,0 +DA:338,0 +DA:339,0 +DA:341,0 +DA:342,0 +DA:343,0 +DA:345,0 +DA:346,0 +DA:347,0 +DA:348,0 +DA:349,0 +DA:355,0 +DA:360,0 +DA:361,0 +DA:363,0 +DA:364,0 +DA:365,0 +DA:367,0 +DA:368,0 +DA:369,0 +DA:370,0 +DA:371,0 +LF:182 +LH:98 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-operator.c +FNL:0,1025,1084 +FNA:0,0,CeedOperatorApplyAddAtPoints_Ref +FNL:1,1089,1310 +FNA:1,0,CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref +FNL:2,1315,1317 +FNA:2,0,CeedOperatorLinearAssembleQFunctionAtPoints_Ref +FNL:3,1322,1325 +FNA:3,0,CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref +FNL:4,1330,1528 +FNA:4,0,CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref +FNL:5,145,206 +FNA:5,96,CeedOperatorSetup_Ref +FNL:6,1533,1735 +FNA:6,0,CeedOperatorAssembleSingleAtPoints_Ref +FNL:7,1740,1771 +FNA:7,96,CeedOperatorDestroy_Ref +FNL:8,1776,1789 +FNA:8,96,CeedOperatorCreate_Ref +FNL:9,1794,1810 +FNA:9,0,CeedOperatorCreateAtPoints_Ref +FNL:10,19,140 +FNA:10,192,CeedOperatorSetupFields_Ref +FNL:11,211,249 +FNA:11,96,CeedOperatorSetupInputs_Ref +FNL:12,254,299 +FNA:12,1344,CeedOperatorInputBasis_Ref +FNL:13,304,345 +FNA:13,1344,CeedOperatorOutputBasis_Ref +FNL:14,350,373 +FNA:14,96,CeedOperatorRestoreInputs_Ref +FNL:15,378,473 +FNA:15,96,CeedOperatorApplyAdd_Ref +FNL:16,478,654 +FNA:16,0,CeedOperatorLinearAssembleQFunctionCore_Ref +FNL:17,659,661 +FNA:17,0,CeedOperatorLinearAssembleQFunction_Ref +FNL:18,666,668 +FNA:18,0,CeedOperatorLinearAssembleQFunctionUpdate_Ref +FNL:19,673,827 +FNA:19,0,CeedOperatorSetupFieldsAtPoints_Ref +FNL:20,832,882 +FNA:20,0,CeedOperatorSetupAtPoints_Ref +FNL:21,887,949 +FNA:21,0,CeedOperatorInputBasisAtPoints_Ref +FNL:22,954,1020 +FNA:22,0,CeedOperatorOutputBasisAtPoints_Ref +FNF:23 +FNH:9 +DA:19,192 +DA:31,192 +DA:32,192 +DA:33,192 +DA:34,192 +DA:36,192 +DA:37,96 +DA:38,96 +DA:40,96 +DA:41,96 +DA:45,480 +DA:50,288 +DA:51,288 +DA:52,240 +DA:53,240 +DA:54,240 +DA:57,288 +DA:58,96 +DA:59,96 
+DA:60,96 +DA:61,96 +DA:62,96 +DA:63,144 +DA:67,144 +DA:68,144 +DA:69,144 +DA:70,144 +DA:71,144 +DA:72,144 +DA:73,144 +DA:74,144 +DA:75,144 +DA:76,144 +DA:77,48 +DA:78,48 +DA:79,48 +DA:80,48 +DA:81,48 +DA:82,48 +DA:83,48 +DA:87,192 +DA:88,288 +DA:92,192 +DA:93,192 +DA:94,288 +DA:98,96 +DA:99,96 +DA:100,96 +DA:101,0 +DA:102,0 +DA:103,0 +DA:105,96 +DA:106,96 +DA:108,192 +DA:109,192 +DA:112,192 +DA:116,96 +DA:117,96 +DA:118,96 +DA:122,0 +DA:123,0 +DA:124,0 +DA:125,0 +DA:126,0 +DA:127,0 +DA:128,0 +DA:129,0 +DA:131,0 +DA:132,0 +DA:134,96 +DA:135,96 +DA:138,192 +DA:139,192 +DA:145,96 +DA:153,96 +DA:154,96 +DA:156,96 +DA:157,96 +DA:158,96 +DA:159,96 +DA:160,96 +DA:161,96 +DA:164,96 +DA:166,96 +DA:167,96 +DA:168,96 +DA:169,96 +DA:170,96 +DA:171,96 +DA:172,96 +DA:173,96 +DA:174,96 +DA:176,96 +DA:177,96 +DA:181,96 +DA:184,96 +DA:188,96 +DA:192,0 +DA:193,0 +DA:194,0 +DA:196,0 +DA:197,0 +DA:199,0 +DA:203,96 +DA:204,96 +DA:205,96 +DA:211,96 +DA:214,288 +DA:221,192 +DA:222,192 +DA:223,192 +DA:224,96 +DA:225,96 +DA:228,192 +DA:230,192 +DA:233,144 +DA:235,144 +DA:238,144 +DA:239,144 +DA:240,144 +DA:242,144 +DA:244,144 +DA:246,192 +DA:248,96 +DA:254,1344 +DA:257,4032 +DA:264,2688 +DA:268,0 +DA:269,0 +DA:270,0 +DA:271,0 +DA:274,2688 +DA:275,2688 +DA:276,2688 +DA:277,2688 +DA:278,2688 +DA:280,2688 +DA:281,672 +DA:282,672 +DA:283,672 +DA:284,1344 +DA:288,1344 +DA:289,1344 +DA:290,1344 +DA:291,1344 +DA:292,1344 +DA:293,1344 +DA:294,672 +DA:295,672 +DA:298,1344 +DA:304,1344 +DA:307,2688 +DA:314,1344 +DA:315,1344 +DA:316,1344 +DA:317,1344 +DA:319,1344 +DA:320,672 +DA:321,672 +DA:322,672 +DA:326,672 +DA:327,672 +DA:328,672 +DA:330,672 +DA:331,0 +DA:333,672 +DA:335,672 +DA:336,672 +DA:344,1344 +DA:350,96 +DA:352,288 +DA:356,192 +DA:360,0 +DA:361,0 +DA:362,0 +DA:363,0 +DA:366,192 +DA:367,192 +DA:369,144 +DA:372,96 +DA:378,96 +DA:381,96 +DA:388,96 +DA:390,96 +DA:391,96 +DA:394,96 +DA:397,0 +DA:398,0 +DA:399,0 +DA:400,0 +DA:401,0 +DA:402,0 +DA:403,0 +DA:406,96 +DA:407,96 +DA:408,96 +DA:409,96 
+DA:412,96 +DA:415,192 +DA:416,96 +DA:417,0 +DA:419,96 +DA:424,1440 +DA:426,2688 +DA:427,1344 +DA:428,1344 +DA:429,672 +DA:430,672 +DA:436,1344 +DA:439,1344 +DA:440,1344 +DA:444,1344 +DA:449,192 +DA:454,96 +DA:456,96 +DA:458,96 +DA:460,96 +DA:461,96 +DA:463,96 +DA:464,96 +DA:465,96 +DA:466,96 +DA:470,96 +DA:471,96 +DA:472,96 +DA:478,0 +DA:482,0 +DA:488,0 +DA:489,0 +DA:490,0 +DA:491,0 +DA:492,0 +DA:493,0 +DA:494,0 +DA:495,0 +DA:496,0 +DA:499,0 +DA:502,0 +DA:505,0 +DA:508,0 +DA:509,0 +DA:514,0 +DA:516,0 +DA:517,0 +DA:518,0 +DA:519,0 +DA:521,0 +DA:523,0 +DA:524,0 +DA:528,0 +DA:529,0 +DA:534,0 +DA:536,0 +DA:537,0 +DA:538,0 +DA:540,0 +DA:542,0 +DA:543,0 +DA:547,0 +DA:548,0 +DA:549,0 +DA:552,0 +DA:555,0 +DA:558,0 +DA:559,0 +DA:562,0 +DA:564,0 +DA:568,0 +DA:574,0 +DA:575,0 +DA:576,0 +DA:577,0 +DA:578,0 +DA:579,0 +DA:584,0 +DA:585,0 +DA:586,0 +DA:589,0 +DA:591,0 +DA:595,0 +DA:597,0 +DA:600,0 +DA:601,0 +DA:602,0 +DA:604,0 +DA:607,0 +DA:613,0 +DA:614,0 +DA:615,0 +DA:616,0 +DA:617,0 +DA:623,0 +DA:624,0 +DA:625,0 +DA:632,0 +DA:633,0 +DA:637,0 +DA:639,0 +DA:640,0 +DA:642,0 +DA:647,0 +DA:650,0 +DA:651,0 +DA:652,0 +DA:653,0 +DA:659,0 +DA:660,0 +DA:666,0 +DA:667,0 +DA:673,0 +DA:685,0 +DA:686,0 +DA:687,0 +DA:688,0 +DA:690,0 +DA:691,0 +DA:692,0 +DA:694,0 +DA:695,0 +DA:701,0 +DA:704,0 +DA:705,0 +DA:706,0 +DA:707,0 +DA:708,0 +DA:709,0 +DA:710,0 +DA:711,0 +DA:716,0 +DA:720,0 +DA:721,0 +DA:724,0 +DA:725,0 +DA:726,0 +DA:727,0 +DA:730,0 +DA:731,0 +DA:734,0 +DA:735,0 +DA:736,0 +DA:737,0 +DA:738,0 +DA:739,0 +DA:741,0 +DA:742,0 +DA:744,0 +DA:745,0 +DA:747,0 +DA:751,0 +DA:752,0 +DA:753,0 +DA:754,0 +DA:755,0 +DA:756,0 +DA:757,0 +DA:758,0 +DA:759,0 +DA:760,0 +DA:761,0 +DA:762,0 +DA:763,0 +DA:764,0 +DA:765,0 +DA:767,0 +DA:768,0 +DA:771,0 +DA:772,0 +DA:775,0 +DA:776,0 +DA:780,0 +DA:781,0 +DA:782,0 +DA:786,0 +DA:787,0 +DA:788,0 +DA:789,0 +DA:790,0 +DA:791,0 +DA:793,0 +DA:794,0 +DA:796,0 +DA:797,0 +DA:800,0 +DA:804,0 +DA:805,0 +DA:806,0 +DA:810,0 +DA:811,0 +DA:812,0 +DA:813,0 +DA:814,0 +DA:815,0 
+DA:816,0 +DA:818,0 +DA:819,0 +DA:821,0 +DA:822,0 +DA:825,0 +DA:826,0 +DA:832,0 +DA:840,0 +DA:841,0 +DA:843,0 +DA:844,0 +DA:845,0 +DA:846,0 +DA:847,0 +DA:848,0 +DA:851,0 +DA:853,0 +DA:854,0 +DA:855,0 +DA:856,0 +DA:857,0 +DA:858,0 +DA:859,0 +DA:860,0 +DA:862,0 +DA:863,0 +DA:867,0 +DA:870,0 +DA:874,0 +DA:875,0 +DA:876,0 +DA:879,0 +DA:880,0 +DA:881,0 +DA:887,0 +DA:891,0 +DA:901,0 +DA:902,0 +DA:903,0 +DA:904,0 +DA:905,0 +DA:908,0 +DA:909,0 +DA:910,0 +DA:911,0 +DA:914,0 +DA:915,0 +DA:916,0 +DA:918,0 +DA:922,0 +DA:923,0 +DA:924,0 +DA:925,0 +DA:927,0 +DA:929,0 +DA:933,0 +DA:934,0 +DA:935,0 +DA:936,0 +DA:937,0 +DA:939,0 +DA:941,0 +DA:942,0 +DA:943,0 +DA:944,0 +DA:946,0 +DA:948,0 +DA:954,0 +DA:958,0 +DA:967,0 +DA:968,0 +DA:969,0 +DA:970,0 +DA:973,0 +DA:974,0 +DA:976,0 +DA:977,0 +DA:978,0 +DA:979,0 +DA:983,0 +DA:984,0 +DA:985,0 +DA:988,0 +DA:991,0 +DA:992,0 +DA:1001,0 +DA:1002,0 +DA:1003,0 +DA:1007,0 +DA:1008,0 +DA:1009,0 +DA:1011,0 +DA:1012,0 +DA:1014,0 +DA:1016,0 +DA:1017,0 +DA:1019,0 +DA:1025,0 +DA:1026,0 +DA:1027,0 +DA:1028,0 +DA:1029,0 +DA:1035,0 +DA:1036,0 +DA:1037,0 +DA:1038,0 +DA:1039,0 +DA:1042,0 +DA:1045,0 +DA:1048,0 +DA:1051,0 +DA:1055,0 +DA:1056,0 +DA:1057,0 +DA:1060,0 +DA:1064,0 +DA:1065,0 +DA:1069,0 +DA:1073,0 +DA:1077,0 +DA:1080,0 +DA:1081,0 +DA:1082,0 +DA:1083,0 +DA:1089,0 +DA:1092,0 +DA:1093,0 +DA:1094,0 +DA:1099,0 +DA:1101,0 +DA:1102,0 +DA:1103,0 +DA:1104,0 +DA:1105,0 +DA:1106,0 +DA:1107,0 +DA:1108,0 +DA:1111,0 +DA:1114,0 +DA:1117,0 +DA:1118,0 +DA:1121,0 +DA:1124,0 +DA:1125,0 +DA:1130,0 +DA:1132,0 +DA:1136,0 +DA:1138,0 +DA:1139,0 +DA:1140,0 +DA:1141,0 +DA:1144,0 +DA:1145,0 +DA:1147,0 +DA:1149,0 +DA:1150,0 +DA:1154,0 +DA:1155,0 +DA:1160,0 +DA:1162,0 +DA:1166,0 +DA:1168,0 +DA:1169,0 +DA:1170,0 +DA:1171,0 +DA:1174,0 +DA:1175,0 +DA:1176,0 +DA:1178,0 +DA:1180,0 +DA:1181,0 +DA:1185,0 +DA:1189,0 +DA:1192,0 +DA:1193,0 +DA:1195,0 +DA:1198,0 +DA:1201,0 +DA:1202,0 +DA:1205,0 +DA:1209,0 +DA:1210,0 +DA:1211,0 +DA:1214,0 +DA:1218,0 +DA:1224,0 +DA:1225,0 +DA:1226,0 
+DA:1228,0 +DA:1230,0 +DA:1231,0 +DA:1236,0 +DA:1237,0 +DA:1238,0 +DA:1241,0 +DA:1243,0 +DA:1248,0 +DA:1250,0 +DA:1251,0 +DA:1252,0 +DA:1253,0 +DA:1255,0 +DA:1258,0 +DA:1264,0 +DA:1265,0 +DA:1266,0 +DA:1267,0 +DA:1268,0 +DA:1274,0 +DA:1275,0 +DA:1276,0 +DA:1280,0 +DA:1284,0 +DA:1285,0 +DA:1289,0 +DA:1291,0 +DA:1292,0 +DA:1294,0 +DA:1299,0 +DA:1302,0 +DA:1305,0 +DA:1306,0 +DA:1307,0 +DA:1308,0 +DA:1309,0 +DA:1315,0 +DA:1316,0 +DA:1322,0 +DA:1324,0 +DA:1330,0 +DA:1331,0 +DA:1332,0 +DA:1334,0 +DA:1335,0 +DA:1341,0 +DA:1342,0 +DA:1343,0 +DA:1344,0 +DA:1345,0 +DA:1348,0 +DA:1354,0 +DA:1355,0 +DA:1356,0 +DA:1357,0 +DA:1361,0 +DA:1367,0 +DA:1368,0 +DA:1369,0 +DA:1370,0 +DA:1374,0 +DA:1378,0 +DA:1379,0 +DA:1380,0 +DA:1381,0 +DA:1382,0 +DA:1386,0 +DA:1389,0 +DA:1390,0 +DA:1393,0 +DA:1394,0 +DA:1395,0 +DA:1398,0 +DA:1402,0 +DA:1403,0 +DA:1404,0 +DA:1410,0 +DA:1411,0 +DA:1412,0 +DA:1413,0 +DA:1416,0 +DA:1417,0 +DA:1418,0 +DA:1419,0 +DA:1420,0 +DA:1421,0 +DA:1422,0 +DA:1424,0 +DA:1425,0 +DA:1430,0 +DA:1431,0 +DA:1432,0 +DA:1433,0 +DA:1436,0 +DA:1440,0 +DA:1441,0 +DA:1445,0 +DA:1450,0 +DA:1452,0 +DA:1458,0 +DA:1459,0 +DA:1460,0 +DA:1461,0 +DA:1464,0 +DA:1465,0 +DA:1466,0 +DA:1467,0 +DA:1468,0 +DA:1470,0 +DA:1471,0 +DA:1473,0 +DA:1476,0 +DA:1478,0 +DA:1479,0 +DA:1480,0 +DA:1481,0 +DA:1486,0 +DA:1488,0 +DA:1489,0 +DA:1490,0 +DA:1491,0 +DA:1492,0 +DA:1493,0 +DA:1494,0 +DA:1497,0 +DA:1498,0 +DA:1500,0 +DA:1502,0 +DA:1505,0 +DA:1508,0 +DA:1509,0 +DA:1510,0 +DA:1514,0 +DA:1518,0 +DA:1521,0 +DA:1522,0 +DA:1523,0 +DA:1524,0 +DA:1525,0 +DA:1526,0 +DA:1527,0 +DA:1533,0 +DA:1534,0 +DA:1535,0 +DA:1537,0 +DA:1538,0 +DA:1544,0 +DA:1545,0 +DA:1546,0 +DA:1547,0 +DA:1548,0 +DA:1551,0 +DA:1557,0 +DA:1558,0 +DA:1559,0 +DA:1560,0 +DA:1564,0 +DA:1570,0 +DA:1571,0 +DA:1572,0 +DA:1573,0 +DA:1577,0 +DA:1580,0 +DA:1584,0 +DA:1585,0 +DA:1586,0 +DA:1587,0 +DA:1588,0 +DA:1592,0 +DA:1595,0 +DA:1596,0 +DA:1599,0 +DA:1601,0 +DA:1602,0 +DA:1605,0 +DA:1609,0 +DA:1610,0 +DA:1611,0 +DA:1617,0 +DA:1618,0 
+DA:1619,0 +DA:1620,0 +DA:1623,0 +DA:1624,0 +DA:1625,0 +DA:1626,0 +DA:1627,0 +DA:1628,0 +DA:1629,0 +DA:1631,0 +DA:1632,0 +DA:1633,0 +DA:1634,0 +DA:1640,0 +DA:1641,0 +DA:1642,0 +DA:1643,0 +DA:1644,0 +DA:1647,0 +DA:1651,0 +DA:1652,0 +DA:1656,0 +DA:1661,0 +DA:1663,0 +DA:1669,0 +DA:1670,0 +DA:1671,0 +DA:1672,0 +DA:1675,0 +DA:1676,0 +DA:1677,0 +DA:1678,0 +DA:1679,0 +DA:1681,0 +DA:1682,0 +DA:1684,0 +DA:1687,0 +DA:1689,0 +DA:1690,0 +DA:1691,0 +DA:1697,0 +DA:1698,0 +DA:1699,0 +DA:1700,0 +DA:1702,0 +DA:1703,0 +DA:1705,0 +DA:1709,0 +DA:1712,0 +DA:1713,0 +DA:1714,0 +DA:1718,0 +DA:1722,0 +DA:1725,0 +DA:1728,0 +DA:1729,0 +DA:1730,0 +DA:1731,0 +DA:1732,0 +DA:1733,0 +DA:1734,0 +DA:1740,96 +DA:1743,96 +DA:1744,96 +DA:1745,96 +DA:1746,96 +DA:1747,96 +DA:1748,384 +DA:1749,288 +DA:1751,96 +DA:1752,96 +DA:1754,288 +DA:1755,192 +DA:1756,192 +DA:1758,96 +DA:1759,96 +DA:1761,192 +DA:1762,96 +DA:1763,96 +DA:1765,96 +DA:1766,96 +DA:1767,96 +DA:1769,96 +DA:1770,96 +DA:1776,96 +DA:1780,96 +DA:1781,96 +DA:1782,96 +DA:1783,96 +DA:1784,96 +DA:1785,96 +DA:1786,96 +DA:1787,96 +DA:1788,96 +DA:1794,0 +DA:1798,0 +DA:1799,0 +DA:1800,0 +DA:1801,0 +DA:1802,0 +DA:1804,0 +DA:1805,0 +DA:1806,0 +DA:1807,0 +DA:1808,0 +DA:1809,0 +LF:868 +LH:222 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-qfunction.c +FNL:0,17,45 +FNA:0,2304,CeedQFunctionApply_Ref +FNL:1,50,58 +FNA:1,288,CeedQFunctionDestroy_Ref +FNL:2,63,76 +FNA:2,288,CeedQFunctionCreate_Ref +FNF:3 +FNH:3 +DA:17,2304 +DA:18,2304 +DA:20,2304 +DA:23,2304 +DA:24,2304 +DA:25,2304 +DA:26,2304 +DA:28,6912 +DA:29,4608 +DA:31,4608 +DA:32,2304 +DA:35,2304 +DA:37,6912 +DA:38,4608 +DA:40,4608 +DA:41,2304 +DA:43,2304 +DA:44,2304 +DA:50,288 +DA:53,288 +DA:54,288 +DA:55,288 +DA:56,288 +DA:57,288 +DA:63,288 +DA:67,288 +DA:68,288 +DA:69,288 +DA:70,288 +DA:71,288 +DA:72,288 +DA:73,288 +DA:74,288 +DA:75,288 +LF:34 +LH:34 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-qfunctioncontext.c +FNL:0,103,103 
+FNA:0,1188,CeedQFunctionContextRestoreData_Ref +FNL:1,108,115 +FNA:1,216,CeedQFunctionContextDestroy_Ref +FNL:2,120,138 +FNA:2,216,CeedQFunctionContextCreate_Ref +FNL:3,18,24 +FNA:3,1188,CeedQFunctionContextHasValidData_Ref +FNL:4,29,36 +FNA:4,0,CeedQFunctionContextHasBorrowedDataOfType_Ref +FNL:5,41,68 +FNA:5,216,CeedQFunctionContextSetData_Ref +FNL:6,73,84 +FNA:6,0,CeedQFunctionContextTakeData_Ref +FNL:7,89,98 +FNA:7,1188,CeedQFunctionContextGetData_Ref +FNF:8 +FNH:6 +DA:18,1188 +DA:21,1188 +DA:22,1188 +DA:23,1188 +DA:29,0 +DA:32,0 +DA:33,0 +DA:34,0 +DA:35,0 +DA:41,216 +DA:45,216 +DA:46,216 +DA:48,216 +DA:50,216 +DA:51,216 +DA:52,0 +DA:53,0 +DA:54,0 +DA:55,0 +DA:56,0 +DA:57,0 +DA:58,72 +DA:59,72 +DA:60,72 +DA:61,72 +DA:62,72 +DA:63,144 +DA:64,144 +DA:65,144 +DA:67,216 +DA:73,0 +DA:76,0 +DA:78,0 +DA:80,0 +DA:81,0 +DA:82,0 +DA:83,0 +DA:89,1188 +DA:92,1188 +DA:94,1188 +DA:96,1188 +DA:97,1188 +DA:103,1188 +DA:108,216 +DA:111,216 +DA:112,216 +DA:113,216 +DA:114,216 +DA:120,216 +DA:124,216 +DA:125,216 +DA:126,216 +DA:127,216 +DA:128,216 +DA:129,216 +DA:130,216 +DA:131,216 +DA:132,216 +DA:133,216 +DA:134,216 +DA:135,216 +DA:136,216 +DA:137,216 +LF:63 +LH:45 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-restriction.c +FNL:0,138,179 +FNA:0,0,CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core +FNL:1,181,218 +FNA:1,816,CeedElemRestrictionApplyStridedTranspose_Ref_Core +FNL:2,19,57 +FNA:2,144,CeedElemRestrictionApplyStridedNoTranspose_Ref_Core +FNL:3,220,242 +FNA:3,816,CeedElemRestrictionApplyOffsetTranspose_Ref_Core +FNL:4,244,266 +FNA:4,0,CeedElemRestrictionApplyOrientedTranspose_Ref_Core +FNL:5,268,317 +FNA:5,0,CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core +FNL:6,319,369 +FNA:6,0,CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core +FNL:7,371,394 +FNA:7,0,CeedElemRestrictionApplyAtPointsInElement_Ref_Core +FNL:8,396,503 +FNA:8,3408,CeedElemRestrictionApply_Ref_Core +FNL:9,508,512 
+FNA:9,2680,CeedElemRestrictionApply_Ref_110 +FNL:10,514,518 +FNA:10,0,CeedElemRestrictionApply_Ref_111 +FNL:11,520,524 +FNA:11,504,CeedElemRestrictionApply_Ref_180 +FNL:12,526,530 +FNA:12,0,CeedElemRestrictionApply_Ref_181 +FNL:13,532,536 +FNA:13,40,CeedElemRestrictionApply_Ref_310 +FNL:14,538,542 +FNA:14,0,CeedElemRestrictionApply_Ref_311 +FNL:15,544,548 +FNA:15,24,CeedElemRestrictionApply_Ref_380 +FNL:16,550,554 +FNA:16,0,CeedElemRestrictionApply_Ref_381 +FNL:17,588,592 +FNA:17,0,CeedElemRestrictionApply_Ref_511 +FNL:18,59,75 +FNA:18,1632,CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core +FNL:19,602,606 +FNA:19,0,CeedElemRestrictionApply_Ref_581 +FNL:20,611,622 +FNA:20,336,CeedElemRestrictionApply_Ref +FNL:21,627,639 +FNA:21,0,CeedElemRestrictionApplyUnsigned_Ref +FNL:22,644,656 +FNA:22,0,CeedElemRestrictionApplyUnoriented_Ref +FNL:23,661,669 +FNA:23,0,CeedElemRestrictionApplyAtPointsInElement_Ref +FNL:24,674,685 +FNA:24,3072,CeedElemRestrictionApplyBlock_Ref +FNL:25,690,699 +FNA:25,360,CeedElemRestrictionGetOffsets_Ref +FNL:26,704,713 +FNA:26,0,CeedElemRestrictionGetOrientations_Ref +FNL:27,718,727 +FNA:27,0,CeedElemRestrictionGetCurlOrientations_Ref +FNL:28,732,741 +FNA:28,816,CeedElemRestrictionDestroy_Ref +FNL:29,746,910 +FNA:29,1032,CeedElemRestrictionCreate_Ref +FNL:30,77,94 +FNA:30,0,CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core +FNL:31,96,136 +FNA:31,0,CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core +FNF:32 +FNH:14 +DA:19,144 +DA:26,144 +DA:27,144 +DA:30,1296 +DA:31,2304 +DA:32,29664 +DA:33,103392 +DA:34,74880 +DA:35,74880 +DA:44,0 +DA:45,0 +DA:46,0 +DA:47,0 +DA:48,0 +DA:49,0 +DA:50,0 +DA:56,144 +DA:59,1632 +DA:66,1632 +DA:67,3936 +DA:68,4968 +DA:69,166824 +DA:70,164160 +DA:74,1632 +DA:77,0 +DA:84,0 +DA:85,0 +DA:86,0 +DA:87,0 +DA:88,0 +DA:89,0 +DA:93,0 +DA:96,0 +DA:103,0 +DA:104,0 +DA:105,0 +DA:106,0 +DA:108,0 +DA:109,0 +DA:110,0 +DA:111,0 +DA:112,0 +DA:113,0 +DA:115,0 +DA:116,0 +DA:117,0 +DA:118,0 +DA:119,0 +DA:120,0 +DA:121,0 +DA:122,0 
+DA:123,0 +DA:126,0 +DA:127,0 +DA:128,0 +DA:129,0 +DA:130,0 +DA:131,0 +DA:135,0 +DA:138,0 +DA:146,0 +DA:147,0 +DA:148,0 +DA:149,0 +DA:151,0 +DA:152,0 +DA:153,0 +DA:154,0 +DA:155,0 +DA:156,0 +DA:158,0 +DA:159,0 +DA:160,0 +DA:161,0 +DA:162,0 +DA:163,0 +DA:164,0 +DA:165,0 +DA:166,0 +DA:169,0 +DA:170,0 +DA:171,0 +DA:172,0 +DA:173,0 +DA:174,0 +DA:178,0 +DA:181,816 +DA:188,816 +DA:189,816 +DA:192,1968 +DA:193,2304 +DA:194,29664 +DA:195,72288 +DA:196,43776 +DA:205,0 +DA:206,0 +DA:207,0 +DA:208,0 +DA:209,0 +DA:210,0 +DA:211,0 +DA:217,816 +DA:220,816 +DA:227,816 +DA:228,1968 +DA:229,2304 +DA:230,19872 +DA:232,48000 +DA:235,29280 +DA:236,29280 +DA:241,816 +DA:244,0 +DA:251,0 +DA:252,0 +DA:253,0 +DA:254,0 +DA:256,0 +DA:259,0 +DA:260,0 +DA:265,0 +DA:268,0 +DA:271,0 +DA:274,0 +DA:276,0 +DA:277,0 +DA:278,0 +DA:280,0 +DA:281,0 +DA:283,0 +DA:284,0 +DA:285,0 +DA:286,0 +DA:287,0 +DA:289,0 +DA:290,0 +DA:292,0 +DA:293,0 +DA:294,0 +DA:295,0 +DA:296,0 +DA:297,0 +DA:298,0 +DA:299,0 +DA:301,0 +DA:302,0 +DA:305,0 +DA:306,0 +DA:307,0 +DA:308,0 +DA:309,0 +DA:311,0 +DA:312,0 +DA:316,0 +DA:319,0 +DA:323,0 +DA:326,0 +DA:328,0 +DA:329,0 +DA:330,0 +DA:332,0 +DA:333,0 +DA:335,0 +DA:336,0 +DA:337,0 +DA:338,0 +DA:339,0 +DA:341,0 +DA:342,0 +DA:344,0 +DA:345,0 +DA:346,0 +DA:347,0 +DA:348,0 +DA:349,0 +DA:350,0 +DA:351,0 +DA:353,0 +DA:354,0 +DA:357,0 +DA:358,0 +DA:359,0 +DA:360,0 +DA:361,0 +DA:363,0 +DA:364,0 +DA:368,0 +DA:371,0 +DA:375,0 +DA:378,0 +DA:379,0 +DA:380,0 +DA:381,0 +DA:382,0 +DA:383,0 +DA:384,0 +DA:387,0 +DA:388,0 +DA:391,0 +DA:393,0 +DA:396,3408 +DA:400,3408 +DA:405,3408 +DA:406,3408 +DA:407,3408 +DA:408,3408 +DA:409,3408 +DA:411,3408 +DA:413,1632 +DA:416,1776 +DA:418,3408 +DA:424,1632 +DA:425,816 +DA:426,816 +DA:428,816 +DA:429,816 +DA:430,816 +DA:432,816 +DA:433,0 +DA:434,0 +DA:435,0 +DA:438,0 +DA:441,0 +DA:442,0 +DA:443,0 +DA:444,0 +DA:446,0 +DA:447,0 +DA:450,0 +DA:453,0 +DA:454,0 +DA:455,0 +DA:456,0 +DA:464,1776 +DA:465,144 +DA:466,144 +DA:468,144 +DA:469,1632 +DA:470,1632 +DA:472,1632 
+DA:473,0 +DA:474,0 +DA:475,0 +DA:478,0 +DA:481,0 +DA:482,0 +DA:483,0 +DA:484,0 +DA:486,0 +DA:487,0 +DA:490,0 +DA:493,0 +DA:494,0 +DA:495,0 +DA:496,0 +DA:499,3408 +DA:500,3408 +DA:501,3408 +DA:502,3408 +DA:508,2680 +DA:511,2680 +DA:514,0 +DA:517,0 +DA:520,504 +DA:523,504 +DA:526,0 +DA:529,0 +DA:532,40 +DA:535,40 +DA:538,0 +DA:541,0 +DA:544,24 +DA:547,24 +DA:550,0 +DA:553,0 +DA:588,0 +DA:591,0 +DA:602,0 +DA:605,0 +DA:611,336 +DA:615,336 +DA:616,336 +DA:617,336 +DA:618,336 +DA:619,336 +DA:620,336 +DA:621,336 +DA:627,0 +DA:632,0 +DA:633,0 +DA:634,0 +DA:635,0 +DA:636,0 +DA:637,0 +DA:638,0 +DA:644,0 +DA:649,0 +DA:650,0 +DA:651,0 +DA:652,0 +DA:653,0 +DA:654,0 +DA:655,0 +DA:661,0 +DA:666,0 +DA:667,0 +DA:668,0 +DA:674,3072 +DA:679,3072 +DA:680,3072 +DA:681,3072 +DA:682,3072 +DA:683,3072 +DA:684,3072 +DA:690,360 +DA:693,360 +DA:695,360 +DA:697,360 +DA:698,360 +DA:704,0 +DA:707,0 +DA:709,0 +DA:711,0 +DA:712,0 +DA:718,0 +DA:721,0 +DA:723,0 +DA:725,0 +DA:726,0 +DA:732,816 +DA:735,816 +DA:736,816 +DA:737,816 +DA:738,816 +DA:739,816 +DA:740,816 +DA:746,1032 +DA:749,1032 +DA:753,1032 +DA:754,1032 +DA:755,1032 +DA:756,1032 +DA:757,1032 +DA:758,1032 +DA:759,1032 +DA:760,1032 +DA:762,1032 +DA:764,1032 +DA:765,1032 +DA:770,1032 +DA:772,1032 +DA:773,1032 +DA:774,384 +DA:775,384 +DA:776,384 +DA:782,1032 +DA:783,0 +DA:785,0 +DA:786,0 +DA:788,0 +DA:789,0 +DA:792,0 +DA:793,0 +DA:797,1032 +DA:802,648 +DA:804,648 +DA:805,648 +DA:806,648 +DA:808,648 +DA:811,168 +DA:812,34328 +DA:813,34160 +DA:819,648 +DA:820,648 +DA:821,648 +DA:824,648 +DA:825,0 +DA:826,0 +DA:827,648 +DA:828,0 +DA:829,0 +DA:835,1032 +DA:837,1032 +DA:838,1032 +DA:839,544 +DA:840,544 +DA:841,544 +DA:842,0 +DA:843,0 +DA:844,0 +DA:845,312 +DA:846,312 +DA:847,312 +DA:848,0 +DA:849,0 +DA:850,0 +DA:851,64 +DA:852,64 +DA:853,64 +DA:854,0 +DA:855,0 +DA:856,0 +DA:857,24 +DA:858,24 +DA:859,24 +DA:860,0 +DA:861,0 +DA:862,0 +DA:880,0 +DA:881,0 +DA:882,0 +DA:888,0 +DA:889,0 +DA:890,0 +DA:891,88 +DA:892,88 +DA:893,88 +DA:897,1032 
+DA:898,1032 +DA:899,1032 +DA:900,1032 +DA:901,0 +DA:903,1032 +DA:904,1032 +DA:905,1032 +DA:906,1032 +DA:907,1032 +DA:908,1032 +DA:909,1032 +LF:428 +LH:162 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-tensor.c +FNL:0,16,38 +FNA:0,4032,CeedTensorContractApply_Ref +FNL:1,43,43 +FNA:1,192,CeedTensorContractDestroy_Ref +FNL:2,48,56 +FNA:2,192,CeedTensorContractCreate_Ref +FNF:3 +FNH:3 +DA:16,4032 +DA:18,4032 +DA:20,4032 +DA:21,1008 +DA:22,1008 +DA:25,4032 +DA:26,830208 +DA:29,26160 +DA:30,145200 +DA:31,839616 +DA:32,716544 +DA:33,5454912 +DA:37,4032 +DA:43,192 +DA:48,192 +DA:51,192 +DA:52,192 +DA:53,192 +DA:54,192 +DA:55,192 +LF:20 +LH:20 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-vector.c +FNL:0,103,110 +FNA:0,8856,CeedVectorGetArrayWrite_Ref +FNL:1,115,115 +FNA:1,10560,CeedVectorRestoreArray_Ref +FNL:2,120,120 +FNA:2,11856,CeedVectorRestoreArrayRead_Ref +FNL:3,125,132 +FNA:3,2784,CeedVectorDestroy_Ref +FNL:4,137,156 +FNA:4,2784,CeedVectorCreate_Ref +FNL:5,18,25 +FNA:5,13560,CeedVectorHasValidArray_Ref +FNL:6,30,37 +FNA:6,0,CeedVectorHasBorrowedArrayOfType_Ref +FNL:7,42,54 +FNA:7,4752,CeedVectorSetArray_Ref +FNL:8,59,70 +FNA:8,0,CeedVectorTakeArray_Ref +FNL:9,75,84 +FNA:9,22416,CeedVectorGetArrayCore_Ref +FNL:10,89,91 +FNA:10,11856,CeedVectorGetArrayRead_Ref +FNL:11,96,98 +FNA:11,1704,CeedVectorGetArray_Ref +FNF:12 +FNH:10 +DA:18,13560 +DA:21,13560 +DA:23,13560 +DA:24,13560 +DA:30,0 +DA:33,0 +DA:34,0 +DA:35,0 +DA:36,0 +DA:42,4752 +DA:46,4752 +DA:47,4752 +DA:49,4752 +DA:51,4752 +DA:53,4752 +DA:59,0 +DA:62,0 +DA:64,0 +DA:66,0 +DA:67,0 +DA:68,0 +DA:69,0 +DA:75,22416 +DA:78,22416 +DA:80,22416 +DA:82,22416 +DA:83,22416 +DA:89,11856 +DA:90,11856 +DA:96,1704 +DA:97,1704 +DA:103,8856 +DA:106,8856 +DA:108,8856 +DA:109,8856 +DA:115,10560 +DA:120,11856 +DA:125,2784 +DA:128,2784 +DA:129,2784 +DA:130,2784 +DA:131,2784 +DA:137,2784 +DA:141,2784 +DA:142,2784 +DA:143,2784 +DA:144,2784 +DA:145,2784 +DA:146,2784 +DA:147,2784 
+DA:148,2784 +DA:149,2784 +DA:150,2784 +DA:151,2784 +DA:152,2784 +DA:153,2784 +DA:154,2784 +DA:155,2784 +LF:58 +LH:46 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref.c +FNL:0,17,36 +FNA:0,192,CeedInit_Ref +FNL:1,41,46 +FNA:1,192,CeedRegister_Ref +FNF:2 +FNH:2 +DA:17,192 +DA:18,192 +DA:20,192 +DA:22,192 +DA:23,192 +DA:24,192 +DA:25,192 +DA:26,192 +DA:27,192 +DA:28,192 +DA:29,192 +DA:30,192 +DA:31,192 +DA:32,192 +DA:33,192 +DA:34,192 +DA:35,192 +DA:41,192 +DA:44,192 +LF:19 +LH:19 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/weak/ceed-backend-weak.c +FNF:0 +FNH:0 +LF:0 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-blocked.c +FNL:0,18,32 +FNA:0,24,CeedInit_Xsmm_Blocked +FNL:1,37,37 +FNA:1,192,CeedRegister_Xsmm_Blocked +FNF:2 +FNH:2 +DA:18,24 +DA:21,24 +DA:23,24 +DA:26,24 +DA:27,24 +DA:28,24 +DA:30,24 +DA:31,24 +DA:37,192 +LF:9 +LH:9 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-serial.c +FNL:0,18,32 +FNA:0,24,CeedInit_Xsmm_Serial +FNL:1,37,37 +FNA:1,192,CeedRegister_Xsmm_Serial +FNF:2 +FNH:2 +DA:18,24 +DA:21,24 +DA:23,24 +DA:26,24 +DA:27,24 +DA:28,24 +DA:30,24 +DA:31,24 +DA:37,192 +LF:9 +LH:9 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-tensor.c +FNL:0,17,63 +FNA:0,2016,CeedTensorContractApply_Xsmm +FNL:1,68,71 +FNA:1,96,CeedTensorContractCreate_Xsmm +FNF:2 +FNH:2 +DA:17,2016 +DA:19,2016 +DA:21,1344 +DA:22,1344 +DA:23,1344 +DA:27,1344 +DA:29,1344 +DA:32,1344 +DA:35,1344 +DA:36,1344 +DA:37,1344 +DA:38,1344 +DA:41,672 +DA:42,672 +DA:43,672 +DA:47,672 +DA:49,672 +DA:52,672 +DA:55,672 +DA:56,4696 +DA:57,4024 +DA:58,4024 +DA:59,4024 +DA:62,2016 +DA:68,96 +DA:69,96 +DA:70,96 +LF:27 +LH:27 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/ex1-volumetest/(t*.test/(t*.-f.h)) +FNL:0,2,36 +FNA:0,384,build_mass_ +FNL:1,39,54 +FNA:1,384,apply_mass_ +FNF:2 +FNH:2 +DA:2,384 +DA:14,384 +DA:15,384 +DA:17,672 +DA:19,3648 +DA:20,3360 +DA:24,4752 +DA:25,4680 
+DA:29,17664 +DA:32,17304 +DA:35,384 +DA:36,384 +DA:39,384 +DA:50,25344 +DA:51,25344 +DA:53,384 +DA:54,384 +LF:17 +LH:17 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume-f.f90 +FNL:0,157,190 +FNA:0,96,transformmeshcoords_ +FNL:1,193,250 +FNA:1,96,setcartesianmeshcoords_ +FNL:2,253,557 +FNA:2,96,MAIN__ +FNL:3,42,72 +FNA:3,96,getcartesianmeshsize_ +FNL:4,557,557 +FNA:4,96,main +FNL:5,75,154 +FNA:5,192,buildcartesianrestriction_ +FNF:6 +FNH:6 +DA:42,96 +DA:51,96 +DA:52,96 +DA:58,384 +DA:59,288 +DA:60,288 +DA:62,96 +DA:64,288 +DA:65,192 +DA:66,192 +DA:67,64 +DA:68,64 +DA:70,288 +DA:72,96 +DA:75,192 +DA:100,192 +DA:104,192 +DA:105,192 +DA:106,192 +DA:107,192 +DA:108,192 +DA:110,576 +DA:111,384 +DA:112,384 +DA:113,576 +DA:115,192 +DA:119,192 +DA:120,192 +DA:122,2880 +DA:123,2688 +DA:124,2688 +DA:125,2688 +DA:126,2688 +DA:128,6144 +DA:129,3456 +DA:130,6144 +DA:133,41920 +DA:134,39040 +DA:135,39040 +DA:136,39040 +DA:138,122880 +DA:139,83840 +DA:140,83840 +DA:141,122880 +DA:143,41728 +DA:148,192 +DA:149,192 +DA:151,96 +DA:153,192 +DA:154,192 +DA:157,96 +DA:172,96 +DA:173,128 +DA:175,4160 +DA:176,4160 +DA:178,32 +DA:181,12160 +DA:182,12096 +DA:183,12096 +DA:185,12096 +DA:186,12160 +DA:188,160 +DA:190,96 +DA:193,96 +DA:210,96 +DA:211,96 +DA:214,96 +DA:215,96 +DA:217,288 +DA:218,192 +DA:219,288 +DA:222,96 +DA:223,96 +DA:226,96 +DA:227,96 +DA:228,96 +DA:229,96 +DA:230,576 +DA:231,576 +DA:234,16320 +DA:235,16224 +DA:237,51840 +DA:238,35520 +DA:239,35520 +DA:240,51744 +DA:243,96 +DA:245,96 +DA:247,96 +DA:248,96 +DA:249,96 +DA:250,96 +DA:253,96 +DA:267,96 +DA:282,96 +DA:283,96 +DA:284,96 +DA:285,96 +DA:286,96 +DA:287,96 +DA:288,96 +DA:289,96 +DA:290,96 +DA:291,96 +DA:292,96 +DA:296,96 +DA:297,624 +DA:298,528 +DA:300,96 +DA:303,0 +DA:306,96 +DA:309,96 +DA:310,96 +DA:311,96 +DA:314,0 +DA:315,0 +DA:318,0 +DA:319,0 +DA:322,0 +DA:323,0 +DA:326,0 +DA:327,0 +DA:330,0 +DA:331,0 +DA:335,96 +DA:338,528 +DA:343,96 +DA:344,96 +DA:345,96 +DA:347,0 +DA:352,96 +DA:354,0 
+DA:355,0 +DA:356,0 +DA:357,0 +DA:358,0 +DA:359,0 +DA:360,0 +DA:361,0 +DA:362,0 +DA:364,0 +DA:366,0 +DA:367,0 +DA:368,0 +DA:370,0 +DA:377,96 +DA:381,96 +DA:382,96 +DA:385,96 +DA:386,96 +DA:388,0 +DA:389,0 +DA:390,0 +DA:392,0 +DA:393,0 +DA:395,0 +DA:401,96 +DA:403,96 +DA:405,96 +DA:407,0 +DA:408,0 +DA:414,96 +DA:415,96 +DA:418,96 +DA:419,96 +DA:420,96 +DA:422,96 +DA:424,96 +DA:427,96 +DA:428,16 +DA:430,16 +DA:433,16 +DA:436,48 +DA:441,48 +DA:442,48 +DA:443,48 +DA:444,48 +DA:445,48 +DA:449,96 +DA:450,96 +DA:451,96 +DA:452,96 +DA:455,96 +DA:456,96 +DA:457,288 +DA:458,288 +DA:460,96 +DA:461,96 +DA:464,96 +DA:465,48 +DA:469,48 +DA:470,48 +DA:471,48 +DA:472,48 +DA:476,96 +DA:477,96 +DA:478,96 +DA:479,96 +DA:482,96 +DA:483,96 +DA:485,96 +DA:486,96 +DA:487,96 +DA:488,96 +DA:489,96 +DA:490,96 +DA:493,96 +DA:496,96 +DA:499,96 +DA:501,0 +DA:504,96 +DA:506,96 +DA:511,96 +DA:513,96 +DA:514,16320 +DA:515,16320 +DA:517,96 +DA:519,96 +DA:521,0 +DA:522,0 +DA:523,0 +DA:524,0 +DA:527,96 +DA:528,32 +DA:530,0 +DA:534,64 +DA:536,0 +DA:543,96 +DA:544,96 +DA:545,96 +DA:546,96 +DA:547,96 +DA:548,96 +DA:549,96 +DA:550,96 +DA:551,96 +DA:552,96 +DA:553,96 +DA:554,96 +DA:555,96 +DA:556,96 +DA:557,96 +LF:241 +LH:200 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.c +FNL:0,294,316 +FNA:0,96,GetCartesianMeshSize +FNL:1,318,363 +FNA:1,192,BuildCartesianRestriction +FNL:2,365,394 +FNA:2,96,SetCartesianMeshCoords +FNL:3,401,429 +FNA:3,96,TransformMeshCoords +FNL:4,55,292 +FNA:4,96,main +FNF:5 +FNH:5 +DA:55,96 +DA:56,96 +DA:57,96 +DA:58,96 +DA:59,96 +DA:60,96 +DA:61,96 +DA:62,96 +DA:63,96 +DA:66,432 +DA:97,96 +DA:100,96 +DA:121,96 +DA:126,96 +DA:127,96 +DA:130,96 +DA:132,96 +DA:133,96 +DA:146,96 +DA:147,96 +DA:148,96 +DA:158,96 +DA:159,96 +DA:162,96 +DA:168,96 +DA:169,96 +DA:170,96 +DA:175,96 +DA:177,48 +DA:178,48 +DA:179,48 +DA:182,48 +DA:183,48 +DA:184,48 +DA:185,48 +DA:186,48 +DA:192,96 +DA:193,96 +DA:194,96 +DA:195,96 +DA:199,96 +DA:200,96 +DA:202,288 +DA:203,96 
+DA:204,96 +DA:209,96 +DA:211,48 +DA:214,48 +DA:215,48 +DA:216,48 +DA:217,48 +DA:223,96 +DA:224,96 +DA:225,96 +DA:226,96 +DA:231,96 +DA:232,96 +DA:235,96 +DA:238,96 +DA:241,96 +DA:246,96 +DA:253,96 +DA:258,96 +DA:259,16320 +DA:260,96 +DA:262,96 +DA:270,96 +DA:272,96 +DA:276,96 +DA:277,96 +DA:278,96 +DA:279,96 +DA:280,96 +DA:281,96 +DA:282,96 +DA:283,96 +DA:284,96 +DA:285,96 +DA:286,96 +DA:287,96 +DA:288,96 +DA:289,96 +DA:290,96 +DA:291,96 +DA:294,96 +DA:297,96 +DA:298,96 +DA:300,384 +DA:301,288 +DA:302,288 +DA:304,96 +DA:306,288 +DA:307,192 +DA:309,192 +DA:310,64 +DA:311,64 +DA:313,192 +DA:315,96 +DA:318,192 +DA:319,192 +DA:320,192 +DA:321,192 +DA:322,192 +DA:323,192 +DA:325,576 +DA:326,384 +DA:327,384 +DA:328,384 +DA:330,192 +DA:334,192 +DA:336,2880 +DA:337,2688 +DA:339,6144 +DA:340,3456 +DA:341,3456 +DA:343,2688 +DA:345,41728 +DA:346,39040 +DA:348,122880 +DA:349,83840 +DA:350,83840 +DA:351,83840 +DA:353,39040 +DA:356,192 +DA:358,192 +DA:359,96 +DA:361,192 +DA:362,192 +DA:365,96 +DA:366,96 +DA:367,96 +DA:369,288 +DA:370,192 +DA:371,192 +DA:375,96 +DA:376,96 +DA:379,96 +DA:380,576 +DA:381,16320 +DA:382,16224 +DA:384,51744 +DA:385,35520 +DA:387,35520 +DA:388,35520 +DA:391,96 +DA:392,96 +DA:393,96 +DA:401,96 +DA:405,96 +DA:406,96 +DA:407,4160 +DA:409,4128 +DA:411,32 +DA:413,64 +DA:415,12160 +DA:418,12096 +DA:420,12096 +DA:421,12096 +DA:422,12096 +DA:423,12096 +DA:425,64 +DA:427,96 +DA:428,96 +LF:163 +LH:163 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h +FNL:0,16,52 +FNA:0,384,build_mass +FNL:1,55,64 +FNA:1,384,apply_mass +FNF:2 +FNH:2 +DA:16,384 +DA:17,384 +DA:21,384 +DA:22,384 +DA:24,384 +DA:25,288 +DA:26,288 +DA:29,3360 +DA:30,288 +DA:31,72 +DA:32,72 +DA:35,4680 +DA:36,4608 +DA:38,72 +DA:39,24 +DA:40,24 +DA:43,17304 +DA:44,17280 +DA:45,17280 +DA:46,17280 +DA:47,17280 +DA:49,24 +DA:51,384 +DA:55,384 +DA:58,384 +DA:59,384 +DA:62,25344 +DA:63,384 +LF:28 +LH:28 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/ceed-gallery-list.h 
+FNF:0 +FNH:0 +DA:15,96 +DA:16,96 +DA:17,96 +DA:18,96 +DA:19,96 +DA:20,96 +DA:21,96 +DA:22,96 +DA:23,96 +DA:24,96 +DA:25,96 +DA:26,96 +DA:27,96 +DA:28,96 +DA:29,96 +DA:30,96 +LF:16 +LH:16 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/identity/ceed-identity.c +FNL:0,17,36 +FNA:0,0,CeedQFunctionInit_Identity +FNL:1,41,43 +FNA:1,96,CeedQFunctionRegister_Identity +FNF:2 +FNH:1 +DA:17,0 +DA:19,0 +DA:20,0 +DA:24,0 +DA:28,0 +DA:29,0 +DA:30,0 +DA:31,0 +DA:32,0 +DA:33,0 +DA:35,0 +DA:41,96 +DA:42,96 +LF:13 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/mass-vector/ceed-vectormassapply.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Vector3MassApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Vector3MassApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass1dbuild.c +FNL:0,16,30 +FNA:0,32,CeedQFunctionInit_Mass1DBuild +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Mass1DBuild +FNF:2 +FNH:2 +DA:16,32 +DA:18,32 +DA:19,32 +DA:22,32 +DA:23,32 +DA:24,32 +DA:25,32 +DA:27,32 +DA:29,32 +DA:35,96 +DA:36,96 +LF:11 +LH:11 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass2dbuild.c +FNL:0,16,30 +FNA:0,32,CeedQFunctionInit_Mass2DBuild +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Mass2DBuild +FNF:2 +FNH:2 +DA:16,32 +DA:18,32 +DA:19,32 +DA:22,32 +DA:23,32 +DA:24,32 +DA:25,32 +DA:27,32 +DA:29,32 +DA:35,96 +DA:36,96 +LF:11 +LH:11 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass3dbuild.c +FNL:0,16,30 +FNA:0,32,CeedQFunctionInit_Mass3DBuild +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Mass3DBuild +FNF:2 +FNH:2 +DA:16,32 +DA:18,32 +DA:19,32 +DA:22,32 +DA:23,32 +DA:24,32 +DA:25,32 +DA:27,32 +DA:29,32 +DA:35,96 +DA:36,96 +LF:11 +LH:11 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-massapply.c +FNL:0,16,29 +FNA:0,96,CeedQFunctionInit_MassApply +FNL:1,34,36 
+FNA:1,96,CeedQFunctionRegister_MassApply +FNF:2 +FNH:2 +DA:16,96 +DA:18,96 +DA:19,96 +DA:22,96 +DA:23,96 +DA:24,96 +DA:26,96 +DA:28,96 +DA:34,96 +DA:35,96 +LF:10 +LH:10 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson1dapply.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Vector3Poisson1DApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Vector3Poisson1DApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson2dapply.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Vector3Poisson2DApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Vector3Poisson2DApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson3dapply.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Vector3Poisson3DApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Vector3Poisson3DApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson1dapply.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Poisson1DApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Poisson1DApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson1dbuild.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Poisson1DBuild +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Poisson1DBuild +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson2dapply.c 
+FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Poisson2DApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Poisson2DApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson2dbuild.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Poisson2DBuild +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Poisson2DBuild +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson3dapply.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Poisson3DApply +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Poisson3DApply +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson3dbuild.c +FNL:0,16,30 +FNA:0,0,CeedQFunctionInit_Poisson3DBuild +FNL:1,35,37 +FNA:1,96,CeedQFunctionRegister_Poisson3DBuild +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:22,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:27,0 +DA:29,0 +DA:35,96 +DA:36,96 +LF:11 +LH:2 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/gallery/scale/ceed-scale.c +FNL:0,16,24 +FNA:0,0,CeedQFunctionInit_Scale +FNL:1,29,29 +FNA:1,96,CeedQFunctionRegister_Scale +FNF:2 +FNH:1 +DA:16,0 +DA:18,0 +DA:19,0 +DA:23,0 +DA:29,96 +LF:5 +LH:1 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/ceed.h +FNL:0,525,533 +FNA:0,14400,CeedIntPow +FNL:1,545,545 +FNA:1,420064,CeedIntMin +FNL:2,557,557 +FNA:2,0,CeedIntMax +FNF:3 +FNH:2 +DA:525,14400 +DA:526,14400 +DA:527,20992 +DA:528,6592 +DA:529,6592 +DA:530,6592 +DA:532,14400 +DA:545,420064 +DA:557,0 +LF:9 +LH:8 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-identity.h +FNL:0,17,31 +FNA:0,0,Identity +FNF:1 +FNH:0 +DA:17,0 +DA:19,0 +DA:20,0 +DA:23,0 +DA:25,0 
+DA:28,0 +DA:30,0 +LF:7 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass1dbuild.h +FNL:0,13,24 +FNA:0,576,Mass1DBuild +FNF:1 +FNH:1 +DA:13,576 +DA:16,576 +DA:18,576 +DA:21,6720 +DA:23,576 +LF:5 +LH:5 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass2dbuild.h +FNL:0,13,26 +FNA:0,144,Mass2DBuild +FNF:1 +FNH:1 +DA:13,144 +DA:16,144 +DA:18,144 +DA:21,9360 +DA:22,9216 +DA:25,144 +LF:6 +LH:6 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass3dbuild.h +FNL:0,13,28 +FNA:0,48,Mass3DBuild +FNF:1 +FNH:1 +DA:13,48 +DA:16,48 +DA:18,48 +DA:21,34608 +DA:22,34560 +DA:23,34560 +DA:24,34560 +DA:27,48 +LF:8 +LH:8 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-massapply.h +FNL:0,13,24 +FNA:0,768,MassApply +FNF:1 +FNH:1 +DA:13,768 +DA:16,768 +DA:18,768 +DA:21,50688 +DA:23,768 +LF:5 +LH:5 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson1dapply.h +FNL:0,13,25 +FNA:0,0,Poisson1DApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:19,0 +DA:22,0 +DA:24,0 +LF:5 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h +FNL:0,13,28 +FNA:0,0,Poisson1DBuild +FNF:1 +FNH:0 +DA:13,0 +DA:19,0 +DA:22,0 +DA:25,0 +DA:27,0 +LF:5 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson2dapply.h +FNL:0,13,39 +FNA:0,0,Poisson2DApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:28,0 +DA:29,0 +DA:30,0 +DA:35,0 +DA:38,0 +LF:10 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h +FNL:0,13,38 +FNA:0,0,Poisson2DBuild +FNF:1 +FNH:0 +DA:13,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:27,0 +DA:28,0 +DA:29,0 +DA:30,0 +DA:31,0 +DA:32,0 +DA:33,0 +DA:34,0 +DA:37,0 +LF:13 +LH:0 +end_of_record +TN: 
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson3dapply.h +FNL:0,13,41 +FNA:0,0,Poisson3DApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:29,0 +DA:30,0 +DA:31,0 +DA:32,0 +DA:37,0 +DA:40,0 +LF:11 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h +FNL:0,13,51 +FNA:0,0,Poisson3DBuild +FNF:1 +FNH:0 +DA:13,0 +DA:17,0 +DA:19,0 +DA:21,0 +DA:24,0 +DA:27,0 +DA:28,0 +DA:31,0 +DA:32,0 +DA:35,0 +DA:42,0 +DA:43,0 +DA:44,0 +DA:45,0 +DA:46,0 +DA:47,0 +DA:50,0 +LF:17 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-scale.h +FNL:0,13,27 +FNA:0,0,Scale +FNF:1 +FNH:0 +DA:13,0 +DA:15,0 +DA:19,0 +DA:20,0 +DA:22,0 +DA:25,0 +DA:26,0 +LF:7 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectormassapply.h +FNL:0,13,30 +FNA:0,0,Vector3MassApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:29,0 +LF:8 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h +FNL:0,13,30 +FNA:0,0,Vector3Poisson1DApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:24,0 +DA:25,0 +DA:29,0 +LF:8 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h +FNL:0,13,40 +FNA:0,0,Vector3Poisson2DApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:28,0 +DA:29,0 +DA:30,0 +DA:35,0 +DA:36,0 +DA:39,0 +LF:11 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h +FNL:0,13,43 +FNA:0,0,Vector3Poisson3DApply +FNF:1 +FNH:0 +DA:13,0 +DA:16,0 +DA:18,0 +DA:20,0 +DA:23,0 +DA:29,0 +DA:30,0 +DA:31,0 +DA:32,0 +DA:37,0 +DA:38,0 +DA:39,0 +DA:42,0 +LF:13 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-basis.c +FNL:0,1078,1081 +FNA:0,6528,CeedBasisGetFESpace +FNL:1,1093,1096 
+FNA:1,0,CeedBasisGetTopologyDimension +FNL:2,1108,1111 +FNA:2,4800,CeedBasisGetTensorContract +FNL:3,1123,1127 +FNA:3,384,CeedBasisSetTensorContract +FNL:4,1146,1156 +FNA:4,384,CeedMatrixMatrixMultiply +FNL:5,1171,1207 +FNA:5,384,CeedQRFactorization +FNL:6,1228,1241 +FNA:6,384,CeedHouseholderApplyQ +FNL:7,1256,1285 +FNA:7,384,CeedMatrixPseudoinverse +FNL:8,129,146 +FNA:8,0,CeedGivensRotation +FNL:9,1300,1425 +FNA:9,0,CeedSymmetricSchurDecomposition +FNL:10,1447,1507 +FNA:10,0,CeedSimultaneousDiagonalization +FNL:11,1536,1576 +FNA:11,864,CeedBasisCreateTensorH1 +FNL:12,1594,1653 +FNA:12,384,CeedBasisCreateTensorH1Lagrange +FNL:13,163,178 +FNA:13,0,CeedScalarView +FNL:14,1673,1712 +FNA:14,0,CeedBasisCreateH1 +FNL:15,1732,1771 +FNA:15,0,CeedBasisCreateHdiv +FNL:16,1791,1831 +FNA:16,0,CeedBasisCreateHcurl +FNL:17,1856,1899 +FNA:17,0,CeedBasisCreateProjection +FNL:18,190,193 +FNA:18,0,CeedBasisView_Object +FNL:19,1914,1919 +FNA:19,6528,CeedBasisReferenceCopy +FNL:20,1931,1934 +FNA:20,0,CeedBasisSetNumViewTabs +FNL:21,1946,1949 +FNA:21,0,CeedBasisGetNumViewTabs +FNL:22,1961,2037 +FNA:22,0,CeedBasisView +FNL:23,204,207 +FNA:23,0,CeedBasisDestroy_Object +FNL:24,2060,2065 +FNA:24,4800,CeedBasisApply +FNL:25,2088,2094 +FNA:25,0,CeedBasisApplyAdd +FNL:26,2116,2125 +FNA:26,0,CeedBasisApplyAtPoints +FNL:27,2147,2157 +FNA:27,0,CeedBasisApplyAddAtPoints +FNL:28,2169,2172 +FNA:28,768,CeedBasisGetCeed +FNL:29,2183,2183 +FNA:29,0,CeedBasisReturnCeed +FNL:30,2195,2198 +FNA:30,20736,CeedBasisGetDimension +FNL:31,2210,2213 +FNA:31,0,CeedBasisGetTopology +FNL:32,2225,2228 +FNA:32,13248,CeedBasisGetNumComponents +FNL:33,2240,2243 +FNA:33,10176,CeedBasisGetNumNodes +FNL:34,2255,2259 +FNA:34,5184,CeedBasisGetNumNodes1D +FNL:35,226,322 +FNA:35,0,CeedBasisCreateProjectionMatrices +FNL:36,2271,2274 +FNA:36,10368,CeedBasisGetNumQuadraturePoints +FNL:37,2286,2290 +FNA:37,5184,CeedBasisGetNumQuadraturePoints1D +FNL:38,2302,2305 +FNA:38,0,CeedBasisGetQRef +FNL:39,2317,2320 
+FNA:39,192,CeedBasisGetQWeights +FNL:40,2332,2354 +FNA:40,0,CeedBasisGetInterp +FNL:41,2366,2373 +FNA:41,4992,CeedBasisGetInterp1D +FNL:42,2385,2410 +FNA:42,0,CeedBasisGetGrad +FNL:43,2422,2429 +FNA:43,384,CeedBasisGetGrad1D +FNL:44,2441,2444 +FNA:44,0,CeedBasisGetDiv +FNL:45,2456,2459 +FNA:45,0,CeedBasisGetCurl +FNL:46,2470,2490 +FNA:46,13440,CeedBasisDestroy +FNL:47,2503,2542 +FNA:47,384,CeedGaussQuadrature +FNL:48,2555,2608 +FNA:48,576,CeedLobattoQuadrature +FNL:49,345,376 +FNA:49,4800,CeedBasisApplyCheckDims +FNL:50,398,453 +FNA:50,0,CeedBasisApplyAtPointsCheckDims +FNL:51,476,677 +FNA:51,0,CeedBasisApplyAtPoints_Core +FNL:52,48,53 +FNA:52,0,CeedChebyshevPolynomialsAtPoint +FNL:53,66,80 +FNA:53,0,CeedChebyshevDerivativeAtPoint +FNL:54,708,721 +FNA:54,0,CeedBasisCreateH1Fallback +FNL:55,733,754 +FNA:55,384,CeedBasisGetCollocatedGrad +FNL:56,766,796 +FNA:56,0,CeedBasisGetChebyshevInterp1D +FNL:57,808,811 +FNA:57,10176,CeedBasisIsTensor +FNL:58,823,837 +FNA:58,384,CeedBasisIsCollocated +FNL:59,849,852 +FNA:59,5184,CeedBasisGetData +FNL:60,864,867 +FNA:60,384,CeedBasisSetData +FNL:61,878,881 +FNA:61,6144,CeedBasisReference +FNL:62,897,923 +FNA:62,10368,CeedBasisGetNumQuadratureComponents +FNL:63,937,1066 +FNA:63,0,CeedBasisGetFlopsEstimate +FNL:64,99,108 +FNA:64,3840,CeedHouseholderReflect +FNF:65 +FNH:33 +DA:48,0 +DA:49,0 +DA:50,0 +DA:51,0 +DA:52,0 +DA:66,0 +DA:69,0 +DA:70,0 +DA:71,0 +DA:72,0 +DA:73,0 +DA:74,0 +DA:75,0 +DA:76,0 +DA:77,0 +DA:79,0 +DA:99,3840 +DA:100,19200 +DA:101,15360 +DA:103,65280 +DA:104,15360 +DA:105,65280 +DA:107,3840 +DA:129,0 +DA:130,0 +DA:132,0 +DA:133,0 +DA:134,0 +DA:135,0 +DA:139,0 +DA:140,0 +DA:142,0 +DA:143,0 +DA:145,0 +DA:163,0 +DA:164,0 +DA:165,0 +DA:169,0 +DA:170,0 +DA:172,0 +DA:173,0 +DA:174,0 +DA:175,0 +DA:177,0 +DA:190,0 +DA:191,0 +DA:192,0 +DA:204,0 +DA:205,0 +DA:206,0 +DA:226,0 +DA:231,0 +DA:232,0 +DA:233,0 +DA:237,0 +DA:243,0 +DA:244,0 +DA:245,0 +DA:247,0 +DA:248,0 +DA:249,0 +DA:250,0 +DA:252,0 +DA:253,0 +DA:259,0 +DA:260,0 
+DA:261,0 +DA:267,0 +DA:269,0 +DA:271,0 +DA:272,0 +DA:273,0 +DA:274,0 +DA:276,0 +DA:277,0 +DA:278,0 +DA:280,0 +DA:281,0 +DA:286,0 +DA:287,0 +DA:288,0 +DA:290,0 +DA:293,0 +DA:296,0 +DA:297,0 +DA:299,0 +DA:300,0 +DA:302,0 +DA:303,0 +DA:304,0 +DA:305,0 +DA:306,0 +DA:308,0 +DA:310,0 +DA:311,0 +DA:313,0 +DA:314,0 +DA:319,0 +DA:320,0 +DA:321,0 +DA:345,4800 +DA:347,4800 +DA:349,4800 +DA:350,4800 +DA:351,4800 +DA:352,4800 +DA:353,4800 +DA:354,4800 +DA:355,4800 +DA:358,4800 +DA:359,4800 +DA:360,4608 +DA:365,1536 +DA:366,9216 +DA:367,3072 +DA:368,3072 +DA:369,4608 +DA:370,192 +DA:371,192 +DA:372,192 +DA:374,4800 +DA:375,4800 +DA:398,0 +DA:400,0 +DA:401,0 +DA:403,0 +DA:404,0 +DA:405,0 +DA:406,0 +DA:407,0 +DA:408,0 +DA:409,0 +DA:410,0 +DA:411,0 +DA:414,0 +DA:415,0 +DA:422,0 +DA:426,0 +DA:427,0 +DA:428,0 +DA:429,0 +DA:430,0 +DA:431,0 +DA:432,0 +DA:433,0 +DA:434,0 +DA:435,0 +DA:436,0 +DA:437,0 +DA:438,0 +DA:439,0 +DA:440,0 +DA:441,0 +DA:442,0 +DA:451,0 +DA:452,0 +DA:476,0 +DA:478,0 +DA:480,0 +DA:482,0 +DA:483,0 +DA:484,0 +DA:485,0 +DA:491,0 +DA:492,0 +DA:495,0 +DA:497,0 +DA:498,0 +DA:499,0 +DA:501,0 +DA:507,0 +DA:508,0 +DA:509,0 +DA:510,0 +DA:511,0 +DA:513,0 +DA:514,0 +DA:515,0 +DA:519,0 +DA:520,0 +DA:521,0 +DA:522,0 +DA:526,0 +DA:528,0 +DA:530,0 +DA:532,0 +DA:534,0 +DA:536,0 +DA:537,0 +DA:538,0 +DA:542,0 +DA:543,0 +DA:549,0 +DA:552,0 +DA:553,0 +DA:554,0 +DA:555,0 +DA:556,0 +DA:557,0 +DA:560,0 +DA:561,0 +DA:563,0 +DA:565,0 +DA:566,0 +DA:568,0 +DA:569,0 +DA:571,0 +DA:573,0 +DA:575,0 +DA:576,0 +DA:579,0 +DA:581,0 +DA:582,0 +DA:584,0 +DA:586,0 +DA:587,0 +DA:588,0 +DA:590,0 +DA:591,0 +DA:593,0 +DA:596,0 +DA:598,0 +DA:600,0 +DA:602,0 +DA:603,0 +DA:604,0 +DA:605,0 +DA:607,0 +DA:614,0 +DA:615,0 +DA:616,0 +DA:618,0 +DA:619,0 +DA:620,0 +DA:623,0 +DA:624,0 +DA:626,0 +DA:627,0 +DA:629,0 +DA:630,0 +DA:632,0 +DA:633,0 +DA:636,0 +DA:638,0 +DA:639,0 +DA:642,0 +DA:644,0 +DA:645,0 +DA:647,0 +DA:648,0 +DA:650,0 +DA:651,0 +DA:652,0 +DA:655,0 +DA:656,0 +DA:660,0 +DA:662,0 +DA:664,0 +DA:666,0 
+DA:667,0 +DA:668,0 +DA:671,0 +DA:672,0 +DA:673,0 +DA:676,0 +DA:708,0 +DA:710,0 +DA:713,0 +DA:714,0 +DA:716,0 +DA:717,0 +DA:718,0 +DA:719,0 +DA:720,0 +DA:733,384 +DA:740,384 +DA:741,384 +DA:742,384 +DA:745,384 +DA:746,384 +DA:747,384 +DA:748,384 +DA:749,384 +DA:751,384 +DA:752,384 +DA:753,384 +DA:766,0 +DA:772,0 +DA:773,0 +DA:774,0 +DA:778,0 +DA:779,0 +DA:780,0 +DA:781,0 +DA:784,0 +DA:785,0 +DA:788,0 +DA:789,0 +DA:792,0 +DA:793,0 +DA:794,0 +DA:795,0 +DA:808,10176 +DA:809,10176 +DA:810,10176 +DA:823,384 +DA:824,384 +DA:825,0 +DA:827,0 +DA:828,0 +DA:829,0 +DA:830,0 +DA:834,384 +DA:836,384 +DA:849,5184 +DA:850,5184 +DA:851,5184 +DA:864,384 +DA:865,384 +DA:866,384 +DA:878,6144 +DA:879,6144 +DA:880,6144 +DA:897,10368 +DA:900,10368 +DA:901,10368 +DA:902,6528 +DA:905,6528 +DA:906,6528 +DA:907,6528 +DA:908,3264 +DA:909,3264 +DA:910,3264 +DA:911,0 +DA:912,0 +DA:913,0 +DA:914,0 +DA:915,0 +DA:916,0 +DA:917,576 +DA:919,576 +DA:920,576 +DA:922,10368 +DA:937,0 +DA:941,0 +DA:942,0 +DA:943,0 +DA:946,0 +DA:947,0 +DA:948,0 +DA:949,0 +DA:950,0 +DA:951,0 +DA:952,0 +DA:954,0 +DA:956,0 +DA:957,0 +DA:958,0 +DA:959,0 +DA:961,0 +DA:962,0 +DA:967,0 +DA:968,0 +DA:971,0 +DA:972,0 +DA:974,0 +DA:975,0 +DA:976,0 +DA:977,0 +DA:980,0 +DA:981,0 +DA:982,0 +DA:983,0 +DA:984,0 +DA:985,0 +DA:986,0 +DA:987,0 +DA:988,0 +DA:990,0 +DA:992,0 +DA:994,0 +DA:995,0 +DA:996,0 +DA:997,0 +DA:998,0 +DA:1000,0 +DA:1002,0 +DA:1004,0 +DA:1006,0 +DA:1014,0 +DA:1015,0 +DA:1016,0 +DA:1019,0 +DA:1020,0 +DA:1021,0 +DA:1022,0 +DA:1023,0 +DA:1024,0 +DA:1025,0 +DA:1026,0 +DA:1027,0 +DA:1028,0 +DA:1029,0 +DA:1037,0 +DA:1038,0 +DA:1039,0 +DA:1045,0 +DA:1046,0 +DA:1047,0 +DA:1048,0 +DA:1049,0 +DA:1050,0 +DA:1051,0 +DA:1052,0 +DA:1053,0 +DA:1054,0 +DA:1058,0 +DA:1059,0 +DA:1060,0 +DA:1061,0 +DA:1062,0 +DA:1065,0 +DA:1078,6528 +DA:1079,6528 +DA:1080,6528 +DA:1093,0 +DA:1094,0 +DA:1095,0 +DA:1108,4800 +DA:1109,4800 +DA:1110,4800 +DA:1123,384 +DA:1124,384 +DA:1125,384 +DA:1126,384 +DA:1146,384 +DA:1147,2688 +DA:1148,16128 
+DA:1149,13824 +DA:1151,82944 +DA:1152,13824 +DA:1155,384 +DA:1171,384 +DA:1172,384 +DA:1175,384 +DA:1177,2304 +DA:1178,1920 +DA:1180,1920 +DA:1181,0 +DA:1182,0 +DA:1185,1920 +DA:1186,7680 +DA:1187,5760 +DA:1188,5760 +DA:1190,1920 +DA:1191,1920 +DA:1193,1920 +DA:1197,1920 +DA:1198,7680 +DA:1201,1920 +DA:1203,1920 +DA:1204,7680 +DA:1206,384 +DA:1228,384 +DA:1232,384 +DA:1233,2304 +DA:1234,1920 +DA:1235,7680 +DA:1237,1920 +DA:1239,384 +DA:1240,384 +DA:1256,384 +DA:1259,384 +DA:1260,384 +DA:1261,384 +DA:1262,384 +DA:1265,384 +DA:1268,2688 +DA:1269,384 +DA:1271,2688 +DA:1272,2304 +DA:1273,11520 +DA:1274,9216 +DA:1275,32256 +DA:1276,9216 +DA:1281,384 +DA:1282,384 +DA:1283,384 +DA:1284,384 +DA:1300,0 +DA:1302,0 +DA:1304,0 +DA:1307,0 +DA:1308,0 +DA:1309,0 +DA:1313,0 +DA:1315,0 +DA:1317,0 +DA:1318,0 +DA:1319,0 +DA:1320,0 +DA:1322,0 +DA:1323,0 +DA:1325,0 +DA:1329,0 +DA:1330,0 +DA:1333,0 +DA:1334,0 +DA:1335,0 +DA:1338,0 +DA:1339,0 +DA:1342,0 +DA:1343,0 +DA:1344,0 +DA:1345,0 +DA:1349,0 +DA:1350,0 +DA:1351,0 +DA:1352,0 +DA:1353,0 +DA:1354,0 +DA:1356,0 +DA:1361,0 +DA:1362,0 +DA:1364,0 +DA:1366,0 +DA:1367,0 +DA:1368,0 +DA:1369,0 +DA:1370,0 +DA:1372,0 +DA:1373,0 +DA:1374,0 +DA:1376,0 +DA:1379,0 +DA:1380,0 +DA:1381,0 +DA:1382,0 +DA:1383,0 +DA:1385,0 +DA:1387,0 +DA:1389,0 +DA:1390,0 +DA:1391,0 +DA:1393,0 +DA:1394,0 +DA:1396,0 +DA:1398,0 +DA:1399,0 +DA:1404,0 +DA:1405,0 +DA:1408,0 +DA:1411,0 +DA:1412,0 +DA:1413,0 +DA:1416,0 +DA:1420,0 +DA:1423,0 +DA:1424,0 +DA:1447,0 +DA:1450,0 +DA:1451,0 +DA:1452,0 +DA:1455,0 +DA:1456,0 +DA:1459,0 +DA:1460,0 +DA:1461,0 +DA:1462,0 +DA:1463,0 +DA:1471,0 +DA:1474,0 +DA:1475,0 +DA:1476,0 +DA:1477,0 +DA:1481,0 +DA:1483,0 +DA:1486,0 +DA:1489,0 +DA:1490,0 +DA:1491,0 +DA:1492,0 +DA:1493,0 +DA:1500,0 +DA:1503,0 +DA:1504,0 +DA:1505,0 +DA:1506,0 +DA:1536,864 +DA:1538,864 +DA:1541,480 +DA:1542,480 +DA:1543,480 +DA:1544,480 +DA:1545,480 +DA:1548,384 +DA:1549,384 +DA:1550,384 +DA:1551,384 +DA:1553,384 +DA:1555,384 +DA:1556,384 +DA:1557,384 +DA:1558,384 
+DA:1559,384 +DA:1560,384 +DA:1561,384 +DA:1562,384 +DA:1563,384 +DA:1564,384 +DA:1565,384 +DA:1566,384 +DA:1567,384 +DA:1568,384 +DA:1569,384 +DA:1570,384 +DA:1571,384 +DA:1572,384 +DA:1573,384 +DA:1574,384 +DA:1575,384 +DA:1594,384 +DA:1596,384 +DA:1599,384 +DA:1600,384 +DA:1601,384 +DA:1602,384 +DA:1605,384 +DA:1606,384 +DA:1607,384 +DA:1608,384 +DA:1609,384 +DA:1610,384 +DA:1611,384 +DA:1612,384 +DA:1613,384 +DA:1614,384 +DA:1615,0 +DA:1616,0 +DA:1617,0 +DA:1619,384 +DA:1623,2688 +DA:1624,2304 +DA:1625,2304 +DA:1626,2304 +DA:1627,11520 +DA:1628,9216 +DA:1629,9216 +DA:1630,9216 +DA:1631,32256 +DA:1632,23040 +DA:1633,23040 +DA:1634,23040 +DA:1635,9216 +DA:1636,9216 +DA:1638,23040 +DA:1639,23040 +DA:1641,9216 +DA:1645,384 +DA:1646,384 +DA:1647,384 +DA:1648,384 +DA:1649,384 +DA:1650,384 +DA:1651,384 +DA:1652,384 +DA:1673,0 +DA:1675,0 +DA:1677,0 +DA:1680,0 +DA:1681,0 +DA:1682,0 +DA:1683,0 +DA:1684,0 +DA:1687,0 +DA:1688,0 +DA:1689,0 +DA:1691,0 +DA:1693,0 +DA:1694,0 +DA:1695,0 +DA:1696,0 +DA:1697,0 +DA:1698,0 +DA:1699,0 +DA:1700,0 +DA:1701,0 +DA:1702,0 +DA:1703,0 +DA:1704,0 +DA:1705,0 +DA:1706,0 +DA:1707,0 +DA:1708,0 +DA:1709,0 +DA:1710,0 +DA:1711,0 +DA:1732,0 +DA:1734,0 +DA:1736,0 +DA:1739,0 +DA:1740,0 +DA:1741,0 +DA:1742,0 +DA:1743,0 +DA:1746,0 +DA:1747,0 +DA:1748,0 +DA:1750,0 +DA:1752,0 +DA:1753,0 +DA:1754,0 +DA:1755,0 +DA:1756,0 +DA:1757,0 +DA:1758,0 +DA:1759,0 +DA:1760,0 +DA:1761,0 +DA:1762,0 +DA:1763,0 +DA:1764,0 +DA:1765,0 +DA:1766,0 +DA:1767,0 +DA:1768,0 +DA:1769,0 +DA:1770,0 +DA:1791,0 +DA:1793,0 +DA:1795,0 +DA:1798,0 +DA:1799,0 +DA:1800,0 +DA:1801,0 +DA:1802,0 +DA:1805,0 +DA:1806,0 +DA:1807,0 +DA:1809,0 +DA:1810,0 +DA:1812,0 +DA:1813,0 +DA:1814,0 +DA:1815,0 +DA:1816,0 +DA:1817,0 +DA:1818,0 +DA:1819,0 +DA:1820,0 +DA:1821,0 +DA:1822,0 +DA:1823,0 +DA:1824,0 +DA:1825,0 +DA:1826,0 +DA:1827,0 +DA:1828,0 +DA:1829,0 +DA:1830,0 +DA:1856,0 +DA:1862,0 +DA:1865,0 +DA:1871,0 +DA:1872,0 +DA:1873,0 +DA:1875,0 +DA:1876,0 +DA:1877,0 +DA:1880,0 +DA:1881,0 +DA:1882,0 
+DA:1888,0 +DA:1889,0 +DA:1890,0 +DA:1891,0 +DA:1895,0 +DA:1896,0 +DA:1897,0 +DA:1898,0 +DA:1914,6528 +DA:1915,6528 +DA:1916,6528 +DA:1917,6528 +DA:1918,6528 +DA:1931,0 +DA:1932,0 +DA:1933,0 +DA:1946,0 +DA:1947,0 +DA:1948,0 +DA:1961,0 +DA:1963,0 +DA:1968,0 +DA:1969,0 +DA:1970,0 +DA:1973,0 +DA:1975,0 +DA:1976,0 +DA:1977,0 +DA:1981,0 +DA:1982,0 +DA:1983,0 +DA:1985,0 +DA:1987,0 +DA:1989,0 +DA:1993,0 +DA:1994,0 +DA:1995,0 +DA:1996,0 +DA:1997,0 +DA:1998,0 +DA:2000,0 +DA:2001,0 +DA:2002,0 +DA:2003,0 +DA:2008,0 +DA:2009,0 +DA:2010,0 +DA:2011,0 +DA:2012,0 +DA:2013,0 +DA:2014,0 +DA:2015,0 +DA:2016,0 +DA:2018,0 +DA:2019,0 +DA:2020,0 +DA:2021,0 +DA:2022,0 +DA:2023,0 +DA:2024,0 +DA:2026,0 +DA:2027,0 +DA:2028,0 +DA:2030,0 +DA:2031,0 +DA:2032,0 +DA:2035,0 +DA:2036,0 +DA:2060,4800 +DA:2061,4800 +DA:2062,4800 +DA:2063,4800 +DA:2064,4800 +DA:2088,0 +DA:2089,0 +DA:2090,0 +DA:2091,0 +DA:2092,0 +DA:2093,0 +DA:2116,0 +DA:2118,0 +DA:2119,0 +DA:2120,0 +DA:2122,0 +DA:2124,0 +DA:2147,0 +DA:2149,0 +DA:2150,0 +DA:2151,0 +DA:2152,0 +DA:2154,0 +DA:2156,0 +DA:2169,768 +DA:2170,768 +DA:2171,768 +DA:2183,0 +DA:2195,20736 +DA:2196,20736 +DA:2197,20736 +DA:2210,0 +DA:2211,0 +DA:2212,0 +DA:2225,13248 +DA:2226,13248 +DA:2227,13248 +DA:2240,10176 +DA:2241,10176 +DA:2242,10176 +DA:2255,5184 +DA:2256,5184 +DA:2257,5184 +DA:2258,5184 +DA:2271,10368 +DA:2272,10368 +DA:2273,10368 +DA:2286,5184 +DA:2287,5184 +DA:2288,5184 +DA:2289,5184 +DA:2302,0 +DA:2303,0 +DA:2304,0 +DA:2317,192 +DA:2318,192 +DA:2319,192 +DA:2332,0 +DA:2333,0 +DA:2335,0 +DA:2338,0 +DA:2341,0 +DA:2342,0 +DA:2343,0 +DA:2344,0 +DA:2345,0 +DA:2347,0 +DA:2352,0 +DA:2353,0 +DA:2366,4992 +DA:2369,4992 +DA:2370,4992 +DA:2371,4992 +DA:2372,4992 +DA:2385,0 +DA:2386,0 +DA:2388,0 +DA:2391,0 +DA:2394,0 +DA:2395,0 +DA:2396,0 +DA:2397,0 +DA:2398,0 +DA:2399,0 +DA:2401,0 +DA:2402,0 +DA:2408,0 +DA:2409,0 +DA:2422,384 +DA:2425,384 +DA:2426,384 +DA:2427,384 +DA:2428,384 +DA:2441,0 +DA:2442,0 +DA:2443,0 +DA:2456,0 +DA:2457,0 +DA:2458,0 +DA:2470,13440 
+DA:2471,13440 +DA:2472,13056 +DA:2473,13056 +DA:2475,384 +DA:2476,384 +DA:2477,384 +DA:2478,384 +DA:2479,384 +DA:2480,384 +DA:2481,384 +DA:2482,384 +DA:2483,384 +DA:2484,384 +DA:2485,384 +DA:2486,384 +DA:2487,384 +DA:2488,384 +DA:2489,384 +DA:2503,384 +DA:2504,384 +DA:2507,1920 +DA:2509,1536 +DA:2511,1536 +DA:2512,1536 +DA:2513,1536 +DA:2514,9216 +DA:2515,7680 +DA:2516,7680 +DA:2517,7680 +DA:2520,1536 +DA:2521,1536 +DA:2523,7296 +DA:2524,5760 +DA:2525,5760 +DA:2526,34560 +DA:2527,28800 +DA:2528,28800 +DA:2529,28800 +DA:2531,5760 +DA:2532,5760 +DA:2535,1536 +DA:2536,1536 +DA:2537,1536 +DA:2538,1536 +DA:2539,1536 +DA:2541,384 +DA:2555,576 +DA:2556,576 +DA:2560,576 +DA:2561,576 +DA:2562,576 +DA:2563,96 +DA:2564,96 +DA:2566,576 +DA:2567,576 +DA:2569,1728 +DA:2571,1152 +DA:2573,1152 +DA:2574,1152 +DA:2575,1152 +DA:2576,4608 +DA:2577,3456 +DA:2578,3456 +DA:2579,3456 +DA:2582,1152 +DA:2583,1152 +DA:2584,1152 +DA:2586,14976 +DA:2587,13824 +DA:2588,13824 +DA:2589,55296 +DA:2590,41472 +DA:2591,41472 +DA:2592,41472 +DA:2594,13824 +DA:2595,13824 +DA:2596,13824 +DA:2599,1152 +DA:2600,1152 +DA:2601,192 +DA:2602,192 +DA:2604,1152 +DA:2605,1152 +DA:2607,576 +LF:1024 +LH:339 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-config.c +FNL:0,33,36 +FNA:0,0,CeedGetGitVersion +FNL:1,50,53 +FNA:1,0,CeedSetIsClang +FNL:2,67,70 +FNA:2,0,CeedGetIsClang +FNL:3,85,88 +FNA:3,0,CeedGetBuildConfiguration +FNF:4 +FNH:0 +DA:33,0 +DA:34,0 +DA:35,0 +DA:50,0 +DA:51,0 +DA:52,0 +DA:67,0 +DA:68,0 +DA:69,0 +DA:85,0 +DA:86,0 +DA:87,0 +LF:12 +LH:0 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-elemrestriction.c +FNL:0,1018,1060 +FNA:0,0,CeedElemRestrictionCreateBlockedOriented +FNL:1,1090,1133 +FNA:1,0,CeedElemRestrictionCreateBlockedCurlOriented +FNL:2,111,114 +FNA:2,0,CeedElemRestrictionView_Object +FNL:3,1154,1190 +FNA:3,288,CeedElemRestrictionCreateBlockedStrided +FNL:4,1204,1221 +FNA:4,0,CeedElemRestrictionCreateUnsignedCopy +FNL:5,1235,1252 
+FNA:5,0,CeedElemRestrictionCreateUnorientedCopy +FNL:6,125,128 +FNA:6,0,CeedElemRestrictionDestroy_Object +FNL:7,1269,1274 +FNA:7,11568,CeedElemRestrictionReferenceCopy +FNL:8,1287,1298 +FNA:8,960,CeedElemRestrictionCreateVector +FNL:9,1314,1336 +FNA:9,576,CeedElemRestrictionApply +FNL:10,1353,1391 +FNA:10,0,CeedElemRestrictionApplyAtPointsInElement +FNL:11,1408,1446 +FNA:11,3072,CeedElemRestrictionApplyBlock +FNL:12,1458,1461 +FNA:12,2976,CeedElemRestrictionGetCeed +FNL:13,1472,1472 +FNA:13,0,CeedElemRestrictionReturnCeed +FNL:14,148,151 +FNA:14,7296,CeedElemRestrictionGetType +FNL:15,1484,1487 +FNA:15,5664,CeedElemRestrictionGetCompStride +FNL:16,1499,1502 +FNA:16,10656,CeedElemRestrictionGetNumElements +FNL:17,1514,1517 +FNA:17,16992,CeedElemRestrictionGetElementSize +FNL:18,1530,1539 +FNA:18,0,CeedElemRestrictionGetNumPoints +FNL:19,1553,1565 +FNA:19,0,CeedElemRestrictionGetNumPointsInElement +FNL:20,1578,1605 +FNA:20,0,CeedElemRestrictionGetMinMaxPointsInElement +FNL:21,1619,1621 +FNA:21,0,CeedElemRestrictionGetMaxPointsInElement +FNL:22,163,166 +FNA:22,0,CeedElemRestrictionIsStrided +FNL:23,1635,1637 +FNA:23,0,CeedElemRestrictionGetMinPointsInElement +FNL:24,1649,1652 +FNA:24,6432,CeedElemRestrictionGetLVectorSize +FNL:25,1664,1667 +FNA:25,1536,CeedElemRestrictionGetEVectorSize +FNL:26,1679,1682 +FNA:26,9888,CeedElemRestrictionGetNumComponents +FNL:27,1694,1697 +FNA:27,1872,CeedElemRestrictionGetNumBlocks +FNL:28,1709,1712 +FNA:28,8016,CeedElemRestrictionGetBlockSize +FNL:29,1724,1739 +FNA:29,0,CeedElemRestrictionGetMultiplicity +FNL:30,1751,1754 +FNA:30,0,CeedElemRestrictionSetNumViewTabs +FNL:31,1766,1769 +FNA:31,0,CeedElemRestrictionGetNumViewTabs +FNL:32,178,181 +FNA:32,0,CeedElemRestrictionIsAtPoints +FNL:33,1781,1818 +FNA:33,0,CeedElemRestrictionView +FNL:34,1829,1845 +FNA:34,25152,CeedElemRestrictionDestroy +FNL:35,194,222 +FNA:35,0,CeedElemRestrictionAtPointsAreCompatible +FNL:36,234,238 +FNA:36,288,CeedElemRestrictionGetStrides +FNL:37,250,255 
+FNA:37,1536,CeedElemRestrictionHasBackendStrides +FNL:38,269,279 +FNA:38,432,CeedElemRestrictionGetOffsets +FNL:39,291,299 +FNA:39,432,CeedElemRestrictionRestoreOffsets +FNL:40,313,319 +FNA:40,0,CeedElemRestrictionGetOrientations +FNL:41,331,335 +FNA:41,0,CeedElemRestrictionRestoreOrientations +FNL:42,349,355 +FNA:42,0,CeedElemRestrictionGetCurlOrientations +FNL:43,367,371 +FNA:43,0,CeedElemRestrictionRestoreCurlOrientations +FNL:44,38,48 +FNA:44,432,CeedPermutePadOffsets +FNL:45,385,400 +FNA:45,0,CeedElemRestrictionGetLLayout +FNL:46,414,422 +FNA:46,480,CeedElemRestrictionSetLLayout +FNL:47,436,440 +FNA:47,0,CeedElemRestrictionGetELayout +FNL:48,454,457 +FNA:48,1296,CeedElemRestrictionSetELayout +FNL:49,472,496 +FNA:49,0,CeedElemRestrictionGetAtPointsElementOffset +FNL:50,509,521 +FNA:50,0,CeedElemRestrictionSetAtPointsEVectorSize +FNL:51,533,536 +FNA:51,7680,CeedElemRestrictionGetData +FNL:52,548,551 +FNA:52,1296,CeedElemRestrictionSetData +FNL:53,562,565 +FNA:53,9456,CeedElemRestrictionReference +FNL:54,576,615 +FNA:54,0,CeedElemRestrictionGetFlopsEstimate +FNL:55,64,73 +FNA:55,0,CeedPermutePadOrients +FNL:56,657,687 +FNA:56,720,CeedElemRestrictionCreate +FNL:57,712,744 +FNA:57,0,CeedElemRestrictionCreateOriented +FNL:58,770,802 +FNA:58,0,CeedElemRestrictionCreateCurlOriented +FNL:59,824,857 +FNA:59,360,CeedElemRestrictionCreateStrided +FNL:60,89,99 +FNA:60,0,CeedPermutePadCurlOrients +FNL:61,891,923 +FNA:61,0,CeedElemRestrictionCreateAtPoints +FNL:62,950,989 +FNA:62,432,CeedElemRestrictionCreateBlocked +FNF:63 +FNH:29 +DA:38,432 +DA:40,3024 +DA:41,9216 +DA:42,166464 +DA:43,159840 +DA:47,432 +DA:64,0 +DA:65,0 +DA:66,0 +DA:67,0 +DA:68,0 +DA:72,0 +DA:89,0 +DA:91,0 +DA:92,0 +DA:93,0 +DA:94,0 +DA:98,0 +DA:111,0 +DA:112,0 +DA:113,0 +DA:125,0 +DA:126,0 +DA:127,0 +DA:148,7296 +DA:149,7296 +DA:150,7296 +DA:163,0 +DA:164,0 +DA:165,0 +DA:178,0 +DA:179,0 +DA:180,0 +DA:194,0 +DA:198,0 +DA:200,0 +DA:203,0 +DA:204,0 +DA:205,0 +DA:206,0 +DA:209,0 +DA:210,0 +DA:211,0 +DA:212,0 
+DA:215,0 +DA:216,0 +DA:217,0 +DA:218,0 +DA:219,0 +DA:221,0 +DA:234,288 +DA:235,288 +DA:236,1152 +DA:237,288 +DA:250,1536 +DA:251,1536 +DA:252,3072 +DA:253,1536 +DA:254,1536 +DA:269,432 +DA:270,432 +DA:271,0 +DA:273,432 +DA:275,432 +DA:276,432 +DA:278,432 +DA:291,432 +DA:292,432 +DA:293,0 +DA:295,432 +DA:296,432 +DA:298,432 +DA:313,0 +DA:314,0 +DA:316,0 +DA:317,0 +DA:318,0 +DA:331,0 +DA:332,0 +DA:333,0 +DA:334,0 +DA:349,0 +DA:350,0 +DA:352,0 +DA:353,0 +DA:354,0 +DA:367,0 +DA:368,0 +DA:369,0 +DA:370,0 +DA:385,0 +DA:389,0 +DA:390,0 +DA:392,0 +DA:393,0 +DA:394,0 +DA:395,0 +DA:397,0 +DA:399,0 +DA:414,480 +DA:417,480 +DA:418,480 +DA:420,1920 +DA:421,480 +DA:436,0 +DA:437,0 +DA:438,0 +DA:439,0 +DA:454,1296 +DA:455,5184 +DA:456,1296 +DA:472,0 +DA:476,0 +DA:477,0 +DA:481,0 +DA:482,0 +DA:483,0 +DA:487,0 +DA:488,0 +DA:489,0 +DA:492,0 +DA:493,0 +DA:495,0 +DA:509,0 +DA:512,0 +DA:513,0 +DA:515,0 +DA:519,0 +DA:520,0 +DA:533,7680 +DA:534,7680 +DA:535,7680 +DA:548,1296 +DA:549,1296 +DA:550,1296 +DA:562,9456 +DA:563,9456 +DA:564,9456 +DA:576,0 +DA:577,0 +DA:580,0 +DA:581,0 +DA:582,0 +DA:583,0 +DA:584,0 +DA:585,0 +DA:586,0 +DA:587,0 +DA:589,0 +DA:590,0 +DA:591,0 +DA:592,0 +DA:593,0 +DA:594,0 +DA:595,0 +DA:596,0 +DA:599,0 +DA:600,0 +DA:603,0 +DA:604,0 +DA:605,0 +DA:606,0 +DA:607,0 +DA:608,0 +DA:609,0 +DA:610,0 +DA:613,0 +DA:614,0 +DA:657,720 +DA:659,720 +DA:662,336 +DA:663,336 +DA:664,336 +DA:665,336 +DA:666,336 +DA:669,384 +DA:670,384 +DA:671,384 +DA:672,384 +DA:674,384 +DA:675,384 +DA:676,384 +DA:677,384 +DA:678,384 +DA:679,384 +DA:680,384 +DA:681,384 +DA:682,384 +DA:683,384 +DA:684,384 +DA:685,384 +DA:686,384 +DA:712,0 +DA:715,0 +DA:718,0 +DA:719,0 +DA:720,0 +DA:722,0 +DA:723,0 +DA:726,0 +DA:727,0 +DA:728,0 +DA:729,0 +DA:731,0 +DA:732,0 +DA:733,0 +DA:734,0 +DA:735,0 +DA:736,0 +DA:737,0 +DA:738,0 +DA:739,0 +DA:740,0 +DA:741,0 +DA:742,0 +DA:743,0 +DA:770,0 +DA:773,0 +DA:776,0 +DA:777,0 +DA:778,0 +DA:780,0 +DA:781,0 +DA:784,0 +DA:785,0 +DA:786,0 +DA:787,0 +DA:789,0 +DA:790,0 
+DA:791,0 +DA:792,0 +DA:793,0 +DA:794,0 +DA:795,0 +DA:796,0 +DA:797,0 +DA:798,0 +DA:799,0 +DA:800,0 +DA:801,0 +DA:824,360 +DA:826,360 +DA:829,168 +DA:830,168 +DA:831,168 +DA:832,168 +DA:833,168 +DA:836,192 +DA:837,192 +DA:838,192 +DA:839,192 +DA:843,192 +DA:844,192 +DA:845,192 +DA:846,192 +DA:847,192 +DA:848,192 +DA:849,192 +DA:850,192 +DA:851,192 +DA:852,192 +DA:853,192 +DA:854,768 +DA:855,192 +DA:856,192 +DA:891,0 +DA:893,0 +DA:896,0 +DA:897,0 +DA:898,0 +DA:899,0 +DA:900,0 +DA:903,0 +DA:904,0 +DA:905,0 +DA:906,0 +DA:910,0 +DA:911,0 +DA:912,0 +DA:913,0 +DA:914,0 +DA:915,0 +DA:916,0 +DA:917,0 +DA:918,0 +DA:919,0 +DA:920,0 +DA:921,0 +DA:922,0 +DA:950,432 +DA:953,432 +DA:955,432 +DA:958,0 +DA:959,0 +DA:960,0 +DA:962,0 +DA:963,0 +DA:966,432 +DA:967,432 +DA:968,432 +DA:969,432 +DA:970,432 +DA:972,432 +DA:973,432 +DA:975,432 +DA:976,432 +DA:977,432 +DA:978,432 +DA:979,432 +DA:980,432 +DA:981,432 +DA:982,432 +DA:983,432 +DA:984,432 +DA:985,432 +DA:986,432 +DA:987,432 +DA:988,432 +DA:1018,0 +DA:1022,0 +DA:1024,0 +DA:1027,0 +DA:1028,0 +DA:1029,0 +DA:1031,0 +DA:1032,0 +DA:1035,0 +DA:1036,0 +DA:1037,0 +DA:1038,0 +DA:1040,0 +DA:1041,0 +DA:1042,0 +DA:1043,0 +DA:1045,0 +DA:1046,0 +DA:1047,0 +DA:1048,0 +DA:1049,0 +DA:1050,0 +DA:1051,0 +DA:1052,0 +DA:1053,0 +DA:1054,0 +DA:1055,0 +DA:1056,0 +DA:1058,0 +DA:1059,0 +DA:1090,0 +DA:1094,0 +DA:1096,0 +DA:1099,0 +DA:1100,0 +DA:1101,0 +DA:1103,0 +DA:1104,0 +DA:1107,0 +DA:1108,0 +DA:1109,0 +DA:1110,0 +DA:1111,0 +DA:1113,0 +DA:1114,0 +DA:1115,0 +DA:1116,0 +DA:1118,0 +DA:1119,0 +DA:1120,0 +DA:1121,0 +DA:1122,0 +DA:1123,0 +DA:1124,0 +DA:1125,0 +DA:1126,0 +DA:1127,0 +DA:1128,0 +DA:1129,0 +DA:1131,0 +DA:1132,0 +DA:1154,288 +DA:1156,288 +DA:1158,288 +DA:1161,0 +DA:1162,0 +DA:1163,0 +DA:1164,0 +DA:1165,0 +DA:1168,288 +DA:1169,288 +DA:1170,288 +DA:1171,288 +DA:1172,288 +DA:1176,288 +DA:1177,288 +DA:1178,288 +DA:1179,288 +DA:1180,288 +DA:1181,288 +DA:1182,288 +DA:1183,288 +DA:1184,288 +DA:1185,288 +DA:1186,288 +DA:1187,1152 +DA:1188,288 
+DA:1189,288 +DA:1204,0 +DA:1205,0 +DA:1208,0 +DA:1209,0 +DA:1211,0 +DA:1212,0 +DA:1213,0 +DA:1214,0 +DA:1216,0 +DA:1219,0 +DA:1220,0 +DA:1235,0 +DA:1236,0 +DA:1239,0 +DA:1240,0 +DA:1242,0 +DA:1243,0 +DA:1244,0 +DA:1245,0 +DA:1247,0 +DA:1250,0 +DA:1251,0 +DA:1269,11568 +DA:1270,11568 +DA:1271,11568 +DA:1272,11568 +DA:1273,11568 +DA:1287,960 +DA:1291,960 +DA:1292,960 +DA:1293,960 +DA:1294,960 +DA:1295,960 +DA:1296,960 +DA:1297,960 +DA:1314,576 +DA:1318,576 +DA:1319,384 +DA:1320,384 +DA:1322,192 +DA:1323,192 +DA:1325,576 +DA:1326,576 +DA:1329,576 +DA:1330,576 +DA:1333,576 +DA:1334,576 +DA:1335,576 +DA:1353,0 +DA:1358,0 +DA:1361,0 +DA:1364,0 +DA:1365,0 +DA:1366,0 +DA:1367,0 +DA:1371,0 +DA:1372,0 +DA:1373,0 +DA:1374,0 +DA:1376,0 +DA:1377,0 +DA:1381,0 +DA:1382,0 +DA:1386,0 +DA:1387,0 +DA:1389,0 +DA:1390,0 +DA:1408,3072 +DA:1413,3072 +DA:1416,3072 +DA:1417,3072 +DA:1420,1536 +DA:1421,1536 +DA:1422,1536 +DA:1423,1536 +DA:1427,1536 +DA:1428,1536 +DA:1429,1536 +DA:1430,1536 +DA:1432,3072 +DA:1433,3072 +DA:1436,3072 +DA:1437,3072 +DA:1440,3072 +DA:1441,3072 +DA:1444,3072 +DA:1445,3072 +DA:1458,2976 +DA:1459,2976 +DA:1460,2976 +DA:1472,0 +DA:1484,5664 +DA:1485,5664 +DA:1486,5664 +DA:1499,10656 +DA:1500,10656 +DA:1501,10656 +DA:1514,16992 +DA:1515,16992 +DA:1516,16992 +DA:1530,0 +DA:1533,0 +DA:1534,0 +DA:1537,0 +DA:1538,0 +DA:1553,0 +DA:1557,0 +DA:1558,0 +DA:1561,0 +DA:1562,0 +DA:1563,0 +DA:1564,0 +DA:1578,0 +DA:1582,0 +DA:1583,0 +DA:1586,0 +DA:1589,0 +DA:1590,0 +DA:1591,0 +DA:1592,0 +DA:1596,0 +DA:1597,0 +DA:1598,0 +DA:1599,0 +DA:1600,0 +DA:1601,0 +DA:1602,0 +DA:1604,0 +DA:1619,0 +DA:1620,0 +DA:1635,0 +DA:1636,0 +DA:1649,6432 +DA:1650,6432 +DA:1651,6432 +DA:1664,1536 +DA:1665,1536 +DA:1666,1536 +DA:1679,9888 +DA:1680,9888 +DA:1681,9888 +DA:1694,1872 +DA:1695,1872 +DA:1696,1872 +DA:1709,8016 +DA:1710,8016 +DA:1711,8016 +DA:1724,0 +DA:1728,0 +DA:1731,0 +DA:1732,0 +DA:1734,0 +DA:1735,0 +DA:1737,0 +DA:1738,0 +DA:1751,0 +DA:1752,0 +DA:1753,0 +DA:1766,0 +DA:1767,0 +DA:1768,0 
+DA:1781,0 +DA:1782,0 +DA:1786,0 +DA:1788,0 +DA:1789,0 +DA:1790,0 +DA:1793,0 +DA:1794,0 +DA:1797,0 +DA:1798,0 +DA:1805,0 +DA:1806,0 +DA:1808,0 +DA:1810,0 +DA:1813,0 +DA:1814,0 +DA:1816,0 +DA:1817,0 +DA:1829,25152 +DA:1830,25152 +DA:1831,24144 +DA:1832,24144 +DA:1834,1008 +DA:1838,1008 +DA:1839,1008 +DA:1841,1008 +DA:1842,1008 +DA:1843,1008 +DA:1844,1008 +LF:596 +LH:225 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-fortran.c +FNL:0,1031,1036 +FNA:0,0,ceedoperatorcompositeaddsub_ +FNL:1,1039,1044 +FNA:1,0,ceedoperatorsetname_ +FNL:2,1047,1051 +FNA:2,0,ceedoperatorsetnumviewtabs_ +FNL:3,1054,1098 +FNA:3,0,ceedoperatorlinearassembleqfunction_ +FNL:4,1101,1124 +FNA:4,0,ceedoperatorlinearassemblediagonal_ +FNL:5,1127,1148 +FNA:5,0,ceedoperatormultigridlevelcreate_ +FNL:6,1151,1172 +FNA:6,0,ceedoperatormultigridlevelcreatetensorh1_ +FNL:7,1175,1196 +FNA:7,0,ceedoperatormultigridlevelcreateh1_ +FNL:8,119,132 +FNA:8,384,ceedvectorcreate_ +FNL:9,1199,1203 +FNA:9,0,ceedoperatorview_ +FNL:10,1206,1241 +FNA:10,0,ceedoperatorcreatefdmelementinverse_ +FNL:11,1244,1270 +FNA:11,192,ceedoperatorapply_ +FNL:12,1273,1299 +FNA:12,0,ceedoperatorapplyadd_ +FNL:13,1302,1307 +FNA:13,0,ceedoperatorapplyjacobian_ +FNL:14,1310,1322 +FNA:14,192,ceedoperatordestroy_ +FNL:15,135,137 +FNA:15,288,ceedvectorsetarray_ +FNL:16,140,145 +FNA:16,0,ceedvectortakearray_ +FNL:17,148,148 +FNA:17,0,ceedvectorsyncarray_ +FNL:18,151,151 +FNA:18,96,ceedvectorsetvalue_ +FNL:19,154,159 +FNA:19,0,ceedvectorgetarray_ +FNL:20,162,167 +FNA:20,96,ceedvectorgetarrayread_ +FNL:21,170,175 +FNA:21,0,ceedvectorgetarraywrite_ +FNL:22,178,182 +FNA:22,0,ceedvectorrestorearray_ +FNL:23,185,188 +FNA:23,96,ceedvectorrestorearrayread_ +FNL:24,191,193 +FNA:24,0,ceedvectornorm_ +FNL:25,196,196 +FNA:25,0,ceedvectorreciprocal_ +FNL:26,199,199 +FNA:26,0,ceedvectorsetnumviewtabs_ +FNL:27,202,202 +FNA:27,0,ceedvectorview_ +FNL:28,205,218 +FNA:28,384,ceedvectordestroy_ +FNL:29,229,246 
+FNA:29,192,ceedelemrestrictioncreate_ +FNL:30,249,267 +FNA:30,0,ceedelemrestrictioncreateoriented_ +FNL:31,270,289 +FNA:31,0,ceedelemrestrictioncreatecurloriented_ +FNL:32,292,307 +FNA:32,96,ceedelemrestrictioncreatestrided_ +FNL:33,310,327 +FNA:33,0,ceedelemrestrictioncreateblocked_ +FNL:34,330,349 +FNA:34,0,ceedelemrestrictioncreateblockedoriented_ +FNL:35,353,372 +FNA:35,0,ceedelemrestrictioncreateblockedcurloriented_ +FNL:36,375,389 +FNA:36,0,ceedelemrestrictioncreateblockedstrided_ +FNL:37,397,419 +FNA:37,0,ceedelemrestrictionapply_ +FNL:38,422,444 +FNA:38,0,ceedelemrestrictionapplyblock_ +FNL:39,447,449 +FNA:39,0,ceedelemrestrictiongetmultiplicity_ +FNL:40,452,456 +FNA:40,0,ceedelemrestrictiongetelayout_ +FNL:41,459,461 +FNA:41,0,ceedelemrestrictionsetnumviewtabs_ +FNL:42,464,464 +FNA:42,0,ceedelemrestrictionview_ +FNL:43,467,479 +FNA:43,0,ceedrequestwait_ +FNL:44,482,495 +FNA:44,0,ceedelemrestrictiondestroy_ +FNL:45,506,518 +FNA:45,192,ceedbasiscreatetensorh1lagrange_ +FNL:46,521,536 +FNA:46,0,ceedbasiscreatetensorh1_ +FNL:47,539,553 +FNA:47,0,ceedbasiscreateh1_ +FNL:48,556,570 +FNA:48,0,ceedbasiscreatehdiv_ +FNL:49,573,587 +FNA:49,0,ceedbasiscreatehcurl_ +FNL:50,590,590 +FNA:50,0,ceedbasissetnumviewtabs_ +FNL:51,593,593 +FNA:51,0,ceedbasisview_ +FNL:52,596,598 +FNA:52,0,ceedbasisgetcollocatedgrad_ +FNL:53,601,604 +FNA:53,0,ceedbasisapply_ +FNL:54,607,607 +FNA:54,0,ceedbasisgetnumnodes_ +FNL:55,610,610 +FNA:55,0,ceedbasisgetnumquadraturepoints_ +FNL:56,613,618 +FNA:56,0,ceedbasisgetinterp1d_ +FNL:57,621,626 +FNA:57,0,ceedbasisgetgrad1d_ +FNL:58,629,634 +FNA:58,0,ceedbasisgetqref_ +FNL:59,637,650 +FNA:59,192,ceedbasisdestroy_ +FNL:60,64,78 +FNA:60,96,ceedinit_ +FNL:61,653,655 +FNA:61,0,ceedgaussquadrature_ +FNL:62,658,660 +FNA:62,96,ceedlobattoquadrature_ +FNL:63,671,683 +FNA:63,96,ceedqfunctioncontextcreate_ +FNL:64,686,689 +FNA:64,96,ceedqfunctioncontextsetdata_ +FNL:65,692,697 +FNA:65,0,ceedqfunctioncontextgetdata_ +FNL:66,700,703 
+FNA:66,0,ceedqfunctioncontextrestoredata_ +FNL:67,706,708 +FNA:67,0,ceedqfunctioncontextsetnumviewtabs_ +FNL:68,711,711 +FNA:68,0,ceedqfunctioncontextview_ +FNL:69,714,727 +FNA:69,96,ceedqfunctioncontextdestroy_ +FNL:70,737,760 +FNA:70,768,CeedQFunctionFortranStub +FNL:71,763,801 +FNA:71,96,ceedqfunctioncreateinterior_ +FNL:72,804,818 +FNA:72,96,ceedqfunctioncreateinteriorbyname_ +FNL:73,81,83 +FNA:73,0,ceedisdeterministic_ +FNL:74,821,834 +FNA:74,0,ceedqfunctioncreateidentity_ +FNL:75,837,843 +FNA:75,192,ceedqfunctionaddinput_ +FNL:76,846,852 +FNA:76,96,ceedqfunctionaddoutput_ +FNL:77,855,869 +FNA:77,48,ceedqfunctionsetcontext_ +FNL:78,86,86 +FNA:78,0,ceedgetpreferredmemtype_ +FNL:79,872,874 +FNA:79,0,ceedqfunctionsetnumviewtabs_ +FNL:80,877,881 +FNA:80,0,ceedqfunctionview_ +FNL:81,885,933 +FNA:81,0,ceedqfunctionapply_ +FNL:82,89,89 +FNA:82,0,ceedsetnumviewtabs_ +FNL:83,92,92 +FNA:83,0,ceedview_ +FNL:84,936,949 +FNA:84,192,ceedqfunctiondestroy_ +FNL:85,95,108 +FNA:85,96,ceeddestroy_ +FNL:86,960,976 +FNA:86,192,ceedoperatorcreate_ +FNL:87,979,991 +FNA:87,0,ceedoperatorcreatecomposite_ +FNL:88,994,1028 +FNA:88,576,ceedoperatorsetfield_ +FNF:89 +FNH:27 +DA:64,96 +DA:65,96 +DA:66,96 +DA:67,96 +DA:68,96 +DA:71,96 +DA:72,96 +DA:74,96 +DA:75,96 +DA:76,96 +DA:78,96 +DA:81,0 +DA:82,0 +DA:83,0 +DA:86,0 +DA:89,0 +DA:92,0 +DA:95,96 +DA:96,96 +DA:97,96 +DA:99,96 +DA:100,96 +DA:101,96 +DA:102,96 +DA:103,96 +DA:104,96 +DA:105,96 +DA:119,384 +DA:120,384 +DA:121,288 +DA:122,288 +DA:125,384 +DA:126,384 +DA:128,384 +DA:129,384 +DA:130,384 +DA:132,384 +DA:135,288 +DA:136,288 +DA:137,288 +DA:140,0 +DA:142,0 +DA:143,0 +DA:144,0 +DA:145,0 +DA:148,0 +DA:151,96 +DA:154,0 +DA:156,0 +DA:157,0 +DA:158,0 +DA:159,0 +DA:162,96 +DA:164,96 +DA:165,96 +DA:166,96 +DA:167,96 +DA:170,0 +DA:172,0 +DA:173,0 +DA:174,0 +DA:175,0 +DA:178,0 +DA:179,0 +DA:180,0 +DA:181,0 +DA:182,0 +DA:185,96 +DA:186,96 +DA:187,96 +DA:188,96 +DA:191,0 +DA:192,0 +DA:193,0 +DA:196,0 +DA:199,0 +DA:202,0 +DA:205,384 +DA:206,384 
+DA:207,384 +DA:209,384 +DA:210,384 +DA:211,384 +DA:212,384 +DA:213,96 +DA:214,96 +DA:215,96 +DA:229,192 +DA:231,192 +DA:232,192 +DA:233,192 +DA:236,192 +DA:238,192 +DA:239,384 +DA:240,192 +DA:242,192 +DA:243,192 +DA:244,192 +DA:246,192 +DA:249,0 +DA:251,0 +DA:252,0 +DA:253,0 +DA:256,0 +DA:257,0 +DA:259,0 +DA:260,0 +DA:261,0 +DA:263,0 +DA:264,0 +DA:265,0 +DA:267,0 +DA:270,0 +DA:273,0 +DA:274,0 +DA:275,0 +DA:278,0 +DA:279,0 +DA:281,0 +DA:282,0 +DA:283,0 +DA:285,0 +DA:286,0 +DA:287,0 +DA:289,0 +DA:292,96 +DA:294,96 +DA:295,96 +DA:296,96 +DA:299,96 +DA:300,96 +DA:301,96 +DA:303,96 +DA:304,96 +DA:305,96 +DA:307,96 +DA:310,0 +DA:312,0 +DA:313,0 +DA:314,0 +DA:317,0 +DA:319,0 +DA:320,0 +DA:321,0 +DA:323,0 +DA:324,0 +DA:325,0 +DA:327,0 +DA:330,0 +DA:333,0 +DA:334,0 +DA:335,0 +DA:338,0 +DA:339,0 +DA:341,0 +DA:342,0 +DA:343,0 +DA:345,0 +DA:346,0 +DA:347,0 +DA:349,0 +DA:353,0 +DA:356,0 +DA:357,0 +DA:358,0 +DA:361,0 +DA:362,0 +DA:364,0 +DA:365,0 +DA:366,0 +DA:368,0 +DA:369,0 +DA:370,0 +DA:372,0 +DA:375,0 +DA:377,0 +DA:378,0 +DA:379,0 +DA:382,0 +DA:383,0 +DA:385,0 +DA:386,0 +DA:387,0 +DA:389,0 +DA:397,0 +DA:398,0 +DA:400,0 +DA:402,0 +DA:403,0 +DA:404,0 +DA:408,0 +DA:409,0 +DA:410,0 +DA:412,0 +DA:413,0 +DA:415,0 +DA:416,0 +DA:417,0 +DA:419,0 +DA:422,0 +DA:423,0 +DA:425,0 +DA:427,0 +DA:428,0 +DA:429,0 +DA:433,0 +DA:434,0 +DA:435,0 +DA:437,0 +DA:438,0 +DA:440,0 +DA:441,0 +DA:442,0 +DA:444,0 +DA:447,0 +DA:448,0 +DA:449,0 +DA:452,0 +DA:454,0 +DA:455,0 +DA:456,0 +DA:459,0 +DA:460,0 +DA:461,0 +DA:464,0 +DA:467,0 +DA:471,0 +DA:472,0 +DA:473,0 +DA:474,0 +DA:475,0 +DA:476,0 +DA:479,0 +DA:482,0 +DA:483,0 +DA:484,0 +DA:486,0 +DA:487,0 +DA:488,0 +DA:489,0 +DA:490,0 +DA:491,0 +DA:492,0 +DA:506,192 +DA:507,192 +DA:508,192 +DA:509,192 +DA:512,192 +DA:514,192 +DA:515,192 +DA:516,192 +DA:518,192 +DA:521,0 +DA:524,0 +DA:525,0 +DA:526,0 +DA:529,0 +DA:530,0 +DA:532,0 +DA:533,0 +DA:534,0 +DA:536,0 +DA:539,0 +DA:541,0 +DA:542,0 +DA:543,0 +DA:546,0 +DA:547,0 +DA:549,0 +DA:550,0 +DA:551,0 +DA:553,0 
+DA:556,0 +DA:558,0 +DA:559,0 +DA:560,0 +DA:563,0 +DA:564,0 +DA:566,0 +DA:567,0 +DA:568,0 +DA:570,0 +DA:573,0 +DA:575,0 +DA:576,0 +DA:577,0 +DA:580,0 +DA:581,0 +DA:583,0 +DA:584,0 +DA:585,0 +DA:587,0 +DA:590,0 +DA:593,0 +DA:596,0 +DA:597,0 +DA:598,0 +DA:601,0 +DA:602,0 +DA:603,0 +DA:604,0 +DA:607,0 +DA:610,0 +DA:613,0 +DA:615,0 +DA:616,0 +DA:617,0 +DA:618,0 +DA:621,0 +DA:623,0 +DA:624,0 +DA:625,0 +DA:626,0 +DA:629,0 +DA:631,0 +DA:632,0 +DA:633,0 +DA:634,0 +DA:637,192 +DA:638,192 +DA:639,192 +DA:641,192 +DA:642,192 +DA:643,192 +DA:644,192 +DA:645,96 +DA:646,96 +DA:647,96 +DA:653,0 +DA:654,0 +DA:655,0 +DA:658,96 +DA:659,96 +DA:660,96 +DA:671,96 +DA:672,96 +DA:673,96 +DA:674,96 +DA:677,96 +DA:679,96 +DA:680,96 +DA:681,96 +DA:682,96 +DA:686,96 +DA:687,96 +DA:688,96 +DA:689,96 +DA:692,0 +DA:694,0 +DA:695,0 +DA:696,0 +DA:697,0 +DA:700,0 +DA:701,0 +DA:702,0 +DA:703,0 +DA:706,0 +DA:707,0 +DA:708,0 +DA:711,0 +DA:714,96 +DA:715,96 +DA:716,96 +DA:718,96 +DA:719,96 +DA:720,96 +DA:721,96 +DA:722,96 +DA:723,96 +DA:724,96 +DA:737,768 +DA:738,768 +DA:739,768 +DA:742,768 +DA:746,768 +DA:747,384 +DA:748,384 +DA:751,768 +DA:752,768 +DA:754,768 +DA:755,384 +DA:756,384 +DA:759,768 +DA:763,96 +DA:771,96 +DA:772,96 +DA:773,96 +DA:774,96 +DA:777,96 +DA:778,96 +DA:780,96 +DA:781,96 +DA:782,96 +DA:786,96 +DA:787,96 +DA:788,96 +DA:789,96 +DA:791,96 +DA:792,96 +DA:793,96 +DA:794,96 +DA:795,96 +DA:796,96 +DA:797,96 +DA:798,96 +DA:800,96 +DA:804,96 +DA:805,96 +DA:806,96 +DA:807,96 +DA:808,96 +DA:811,96 +DA:812,96 +DA:814,96 +DA:815,96 +DA:816,96 +DA:818,96 +DA:821,0 +DA:822,0 +DA:823,0 +DA:824,0 +DA:827,0 +DA:828,0 +DA:830,0 +DA:831,0 +DA:832,0 +DA:834,0 +DA:837,192 +DA:839,192 +DA:840,192 +DA:842,192 +DA:843,192 +DA:846,96 +DA:848,96 +DA:849,96 +DA:851,96 +DA:852,96 +DA:855,48 +DA:856,48 +DA:857,48 +DA:860,48 +DA:861,48 +DA:863,48 +DA:864,48 +DA:865,48 +DA:866,48 +DA:867,48 +DA:868,48 +DA:872,0 +DA:873,0 +DA:874,0 +DA:877,0 +DA:878,0 +DA:880,0 +DA:881,0 +DA:885,0 +DA:888,0 +DA:890,0 +DA:891,0 
+DA:892,0 +DA:893,0 +DA:894,0 +DA:895,0 +DA:896,0 +DA:897,0 +DA:898,0 +DA:899,0 +DA:900,0 +DA:901,0 +DA:902,0 +DA:903,0 +DA:904,0 +DA:905,0 +DA:906,0 +DA:907,0 +DA:909,0 +DA:910,0 +DA:911,0 +DA:912,0 +DA:913,0 +DA:914,0 +DA:915,0 +DA:916,0 +DA:917,0 +DA:918,0 +DA:919,0 +DA:920,0 +DA:921,0 +DA:922,0 +DA:923,0 +DA:924,0 +DA:925,0 +DA:926,0 +DA:927,0 +DA:928,0 +DA:930,0 +DA:931,0 +DA:932,0 +DA:936,192 +DA:937,192 +DA:939,192 +DA:940,192 +DA:941,192 +DA:942,192 +DA:943,192 +DA:944,96 +DA:945,96 +DA:946,96 +DA:960,192 +DA:961,192 +DA:962,192 +DA:963,192 +DA:966,192 +DA:968,192 +DA:969,192 +DA:970,192 +DA:972,192 +DA:973,192 +DA:974,192 +DA:975,192 +DA:979,0 +DA:980,0 +DA:981,0 +DA:982,0 +DA:985,0 +DA:987,0 +DA:988,0 +DA:989,0 +DA:990,0 +DA:994,576 +DA:995,576 +DA:1000,576 +DA:1002,576 +DA:1003,0 +DA:1004,576 +DA:1005,96 +DA:1007,480 +DA:1010,576 +DA:1011,0 +DA:1012,576 +DA:1013,192 +DA:1015,384 +DA:1017,576 +DA:1018,0 +DA:1019,576 +DA:1020,384 +DA:1021,192 +DA:1022,96 +DA:1024,96 +DA:1027,576 +DA:1028,576 +DA:1031,0 +DA:1032,0 +DA:1033,0 +DA:1035,0 +DA:1036,0 +DA:1039,0 +DA:1040,0 +DA:1041,0 +DA:1043,0 +DA:1044,0 +DA:1047,0 +DA:1048,0 +DA:1050,0 +DA:1051,0 +DA:1054,0 +DA:1056,0 +DA:1057,0 +DA:1058,0 +DA:1060,0 +DA:1063,0 +DA:1064,0 +DA:1065,0 +DA:1067,0 +DA:1069,0 +DA:1071,0 +DA:1072,0 +DA:1075,0 +DA:1076,0 +DA:1077,0 +DA:1081,0 +DA:1082,0 +DA:1083,0 +DA:1085,0 +DA:1086,0 +DA:1087,0 +DA:1088,0 +DA:1089,0 +DA:1092,0 +DA:1093,0 +DA:1094,0 +DA:1095,0 +DA:1096,0 +DA:1101,0 +DA:1102,0 +DA:1104,0 +DA:1105,0 +DA:1108,0 +DA:1109,0 +DA:1110,0 +DA:1114,0 +DA:1115,0 +DA:1116,0 +DA:1118,0 +DA:1119,0 +DA:1120,0 +DA:1121,0 +DA:1122,0 +DA:1127,0 +DA:1133,0 +DA:1134,0 +DA:1136,0 +DA:1137,0 +DA:1138,0 +DA:1140,0 +DA:1141,0 +DA:1142,0 +DA:1143,0 +DA:1144,0 +DA:1145,0 +DA:1146,0 +DA:1147,0 +DA:1151,0 +DA:1157,0 +DA:1158,0 +DA:1160,0 +DA:1161,0 +DA:1162,0 +DA:1164,0 +DA:1165,0 +DA:1166,0 +DA:1167,0 +DA:1168,0 +DA:1169,0 +DA:1170,0 +DA:1171,0 +DA:1175,0 +DA:1181,0 +DA:1182,0 +DA:1184,0 
+DA:1185,0 +DA:1186,0 +DA:1188,0 +DA:1189,0 +DA:1190,0 +DA:1191,0 +DA:1192,0 +DA:1193,0 +DA:1194,0 +DA:1195,0 +DA:1199,0 +DA:1200,0 +DA:1202,0 +DA:1203,0 +DA:1206,0 +DA:1208,0 +DA:1209,0 +DA:1210,0 +DA:1212,0 +DA:1214,0 +DA:1216,0 +DA:1217,0 +DA:1220,0 +DA:1221,0 +DA:1222,0 +DA:1226,0 +DA:1227,0 +DA:1228,0 +DA:1230,0 +DA:1231,0 +DA:1232,0 +DA:1233,0 +DA:1234,0 +DA:1237,0 +DA:1238,0 +DA:1239,0 +DA:1244,192 +DA:1245,192 +DA:1246,192 +DA:1248,192 +DA:1250,192 +DA:1251,192 +DA:1254,192 +DA:1255,0 +DA:1256,0 +DA:1260,192 +DA:1261,0 +DA:1262,0 +DA:1264,192 +DA:1265,192 +DA:1266,192 +DA:1267,0 +DA:1268,0 +DA:1273,0 +DA:1274,0 +DA:1275,0 +DA:1277,0 +DA:1279,0 +DA:1280,0 +DA:1283,0 +DA:1284,0 +DA:1285,0 +DA:1289,0 +DA:1290,0 +DA:1291,0 +DA:1293,0 +DA:1294,0 +DA:1295,0 +DA:1296,0 +DA:1297,0 +DA:1302,0 +DA:1307,0 +DA:1310,192 +DA:1311,192 +DA:1312,192 +DA:1313,192 +DA:1314,192 +DA:1315,192 +DA:1316,192 +DA:1317,96 +DA:1318,96 +DA:1319,96 +LF:722 +LH:252 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-jit-tools.c +FNL:0,124,296 +FNA:0,0,CeedLoadSourceToInitializedBuffer +FNL:1,27,61 +FNA:1,144,CeedCheckFilePath +FNL:2,313,324 +FNA:2,0,CeedLoadSourceAndInitializeBuffer +FNL:3,339,350 +FNA:3,0,CeedLoadSourceToBuffer +FNL:4,368,377 +FNA:4,48,CeedPathConcatenate +FNL:5,389,393 +FNA:5,1536,CeedGetJitRelativePath +FNL:6,406,439 +FNA:6,48,CeedGetJitAbsolutePath +FNL:7,74,107 +FNA:7,0,CeedNormalizePath +FNF:8 +FNH:4 +DA:27,144 +DA:31,144 +DA:33,144 +DA:34,144 +DA:36,144 +DA:37,144 +DA:39,0 +DA:43,144 +DA:44,144 +DA:48,144 +DA:49,144 +DA:51,144 +DA:53,96 +DA:54,96 +DA:55,96 +DA:59,144 +DA:60,144 +DA:74,0 +DA:75,0 +DA:77,0 +DA:79,0 +DA:80,0 +DA:81,0 +DA:84,0 +DA:85,0 +DA:87,0 +DA:88,0 +DA:89,0 +DA:92,0 +DA:93,0 +DA:95,0 +DA:96,0 +DA:98,0 +DA:99,0 +DA:100,0 +DA:101,0 +DA:104,0 +DA:106,0 +DA:124,0 +DA:126,0 +DA:130,0 +DA:131,0 +DA:132,0 +DA:135,0 +DA:136,0 +DA:138,0 +DA:139,0 +DA:140,0 +DA:143,0 +DA:146,0 +DA:147,0 +DA:150,0 +DA:157,0 +DA:160,0 +DA:162,0 +DA:164,0 
+DA:165,0 +DA:167,0 +DA:168,0 +DA:171,0 +DA:172,0 +DA:173,0 +DA:176,0 +DA:178,0 +DA:179,0 +DA:180,0 +DA:183,0 +DA:184,0 +DA:186,0 +DA:187,0 +DA:188,0 +DA:189,0 +DA:191,0 +DA:195,0 +DA:197,0 +DA:198,0 +DA:201,0 +DA:202,0 +DA:203,0 +DA:206,0 +DA:208,0 +DA:209,0 +DA:211,0 +DA:212,0 +DA:213,0 +DA:214,0 +DA:216,0 +DA:217,0 +DA:218,0 +DA:219,0 +DA:220,0 +DA:221,0 +DA:222,0 +DA:223,0 +DA:224,0 +DA:225,0 +DA:227,0 +DA:229,0 +DA:232,0 +DA:233,0 +DA:234,0 +DA:236,0 +DA:237,0 +DA:238,0 +DA:239,0 +DA:241,0 +DA:243,0 +DA:245,0 +DA:246,0 +DA:247,0 +DA:248,0 +DA:253,0 +DA:254,0 +DA:255,0 +DA:256,0 +DA:257,0 +DA:258,0 +DA:259,0 +DA:260,0 +DA:262,0 +DA:263,0 +DA:264,0 +DA:265,0 +DA:267,0 +DA:268,0 +DA:269,0 +DA:270,0 +DA:272,0 +DA:275,0 +DA:278,0 +DA:279,0 +DA:281,0 +DA:282,0 +DA:283,0 +DA:284,0 +DA:287,0 +DA:290,0 +DA:291,0 +DA:292,0 +DA:293,0 +DA:294,0 +DA:295,0 +DA:313,0 +DA:315,0 +DA:316,0 +DA:319,0 +DA:322,0 +DA:323,0 +DA:339,0 +DA:340,0 +DA:341,0 +DA:344,0 +DA:347,0 +DA:348,0 +DA:349,0 +DA:368,48 +DA:369,48 +DA:370,48 +DA:371,48 +DA:373,48 +DA:374,48 +DA:375,48 +DA:376,48 +DA:389,1536 +DA:390,1536 +DA:391,1536 +DA:392,1536 +DA:406,48 +DA:411,48 +DA:412,48 +DA:413,48 +DA:415,48 +DA:416,48 +DA:420,48 +DA:421,48 +DA:424,96 +DA:425,48 +DA:427,48 +DA:428,48 +DA:429,48 +LF:180 +LH:41 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-object.c +FNL:0,108,111 +FNA:0,0,CeedObjectView +FNL:1,123,127 +FNA:1,0,CeedObjectSetNumViewTabs +FNL:2,139,142 +FNA:2,0,CeedObjectGetNumViewTabs +FNL:3,154,158 +FNA:3,9888,CeedObjectGetCeed +FNL:4,169,169 +FNA:4,11232,CeedObjectReturnCeed +FNL:5,180,183 +FNA:5,0,CeedObjectDestroy +FNL:6,37,45 +FNA:6,7200,CeedObjectCreate +FNL:7,56,59 +FNA:7,49140,CeedObjectReference +FNL:8,70,72 +FNA:8,55728,CeedObjectDereference +FNL:9,83,88 +FNA:9,6780,CeedObjectDestroy_Private +FNF:10 +FNH:6 +DA:37,7200 +DA:38,7200 +DA:39,7200 +DA:40,7200 +DA:41,7200 +DA:42,7200 +DA:43,7200 +DA:44,7200 +DA:56,49140 +DA:57,49140 +DA:58,49140 +DA:70,55728 +DA:71,55728 
+DA:83,6780 +DA:84,6780 +DA:86,6780 +DA:87,6780 +DA:108,0 +DA:109,0 +DA:110,0 +DA:123,0 +DA:124,0 +DA:125,0 +DA:126,0 +DA:139,0 +DA:140,0 +DA:141,0 +DA:154,9888 +DA:155,9888 +DA:156,9888 +DA:157,9888 +DA:169,11232 +DA:180,0 +DA:181,0 +DA:182,0 +LF:35 +LH:22 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-operator.c +FNL:0,1049,1064 +FNA:0,1920,CeedOperatorGetFields +FNL:1,1079,1100 +FNA:1,0,CeedOperatorAtPointsSetPoints +FNL:2,1112,1115 +FNA:2,1536,CeedOperatorIsAtPoints +FNL:3,1130,1146 +FNA:3,0,CeedOperatorAtPointsGetPoints +FNL:4,1163,1185 +FNA:4,0,CeedOperatorGetFieldByName +FNL:5,1197,1200 +FNA:5,0,CeedOperatorFieldGetName +FNL:6,1214,1218 +FNA:6,10416,CeedOperatorFieldGetElemRestriction +FNL:7,1232,1236 +FNA:7,5376,CeedOperatorFieldGetBasis +FNL:8,1250,1254 +FNA:8,7776,CeedOperatorFieldGetVector +FNL:9,1273,1279 +FNA:9,0,CeedOperatorFieldGetData +FNL:10,1291,1319 +FNA:10,0,CeedOperatorCompositeAddSub +FNL:11,1331,1338 +FNA:11,0,CeedOperatorCompositeGetNumSub +FNL:12,1350,1357 +FNA:12,0,CeedOperatorCompositeGetSubList +FNL:13,1374,1391 +FNA:13,0,CeedOperatorCompositeGetSubByName +FNL:14,138,175 +FNA:14,0,CeedOperatorSingleView +FNL:15,1402,1450 +FNA:15,3072,CeedOperatorCheckReady +FNL:16,1465,1494 +FNA:16,0,CeedOperatorGetActiveVectorLengths +FNL:17,1509,1524 +FNA:17,0,CeedOperatorSetQFunctionAssemblyReuse +FNL:18,1536,1556 +FNA:18,0,CeedOperatorSetQFunctionAssemblyDataUpdateNeeded +FNL:19,1568,1579 +FNA:19,0,CeedOperatorSetName +FNL:20,1591,1602 +FNA:20,0,CeedOperatorGetName +FNL:21,1615,1652 +FNA:21,0,CeedOperatorView_Core +FNL:22,1664,1667 +FNA:22,0,CeedOperatorSetNumViewTabs +FNL:23,1679,1682 +FNA:23,0,CeedOperatorGetNumViewTabs +FNL:24,1694,1697 +FNA:24,0,CeedOperatorView +FNL:25,1709,1712 +FNA:25,0,CeedOperatorViewTerse +FNL:26,1724,1727 +FNA:26,1536,CeedOperatorGetCeed +FNL:27,1738,1738 +FNA:27,1152,CeedOperatorReturnCeed +FNL:28,1750,1757 +FNA:28,384,CeedOperatorGetNumElements +FNL:29,1769,1776 
+FNA:29,768,CeedOperatorGetNumQuadraturePoints +FNL:30,1786,1907 +FNA:30,0,CeedOperatorGetFlopsEstimate +FNL:31,187,190 +FNA:31,0,CeedOperatorView_Object +FNL:32,1924,1937 +FNA:32,0,CeedOperatorGetContext +FNL:33,1952,2051 +FNA:33,0,CeedOperatorGetContextFieldLabel +FNL:34,201,204 +FNA:34,0,CeedOperatorDestroy_Object +FNL:35,2066,2068 +FNA:35,0,CeedOperatorSetContextDouble +FNL:36,2084,2086 +FNA:36,0,CeedOperatorGetContextDoubleRead +FNL:37,2099,2101 +FNA:37,0,CeedOperatorRestoreContextDoubleRead +FNL:38,2116,2118 +FNA:38,0,CeedOperatorSetContextInt32 +FNL:39,2134,2136 +FNA:39,0,CeedOperatorGetContextInt32Read +FNL:40,2149,2151 +FNA:40,0,CeedOperatorRestoreContextInt32Read +FNL:41,2166,2168 +FNA:41,0,CeedOperatorSetContextBoolean +FNL:42,218,221 +FNA:42,0,CeedOperatorGetActiveBasis +FNL:43,2184,2186 +FNA:43,0,CeedOperatorGetContextBooleanRead +FNL:44,2199,2201 +FNA:44,0,CeedOperatorRestoreContextBooleanRead +FNL:45,2220,2241 +FNA:45,384,CeedOperatorApply +FNL:46,2262,2287 +FNA:46,384,CeedOperatorApplyAdd +FNL:47,2306,2354 +FNA:47,384,CeedOperatorApplyAddActive +FNL:48,236,287 +FNA:48,0,CeedOperatorGetActiveBases +FNL:49,2365,2383 +FNA:49,384,CeedOperatorAssemblyDataStrip +FNL:50,2394,2466 +FNA:50,768,CeedOperatorDestroy +FNL:51,301,304 +FNA:51,0,CeedOperatorGetActiveElemRestriction +FNL:52,319,370 +FNA:52,0,CeedOperatorGetActiveElemRestrictions +FNL:53,36,82 +FNA:53,1152,CeedOperatorCheckField +FNL:54,387,432 +FNA:54,0,CeedOperatorContextSetGeneric +FNL:55,450,500 +FNA:55,0,CeedOperatorContextGetGenericRead +FNL:56,517,563 +FNA:56,0,CeedOperatorContextRestoreGenericRead +FNL:57,583,590 +FNA:57,0,CeedOperatorGetNumArgs +FNL:58,604,633 +FNA:58,0,CeedOperatorHasTensorBases +FNL:59,645,648 +FNA:59,1152,CeedOperatorIsImmutable +FNL:60,660,663 +FNA:60,384,CeedOperatorIsSetupDone +FNL:61,675,683 +FNA:61,4224,CeedOperatorGetQFunction +FNL:62,695,698 +FNA:62,10368,CeedOperatorIsComposite +FNL:63,710,713 +FNA:63,1152,CeedOperatorGetData +FNL:64,725,728 
+FNA:64,384,CeedOperatorSetData +FNL:65,739,742 +FNA:65,0,CeedOperatorReference +FNL:66,753,756 +FNA:66,384,CeedOperatorSetSetupDone +FNL:67,781,805 +FNA:67,576,CeedOperatorCreate +FNL:68,823,848 +FNA:68,0,CeedOperatorCreateAtPoints +FNL:69,860,881 +FNA:69,0,CeedOperatorCreateComposite +FNL:70,898,903 +FNA:70,0,CeedOperatorReferenceCopy +FNL:71,928,1032 +FNA:71,1152,CeedOperatorSetField +FNL:72,98,125 +FNA:72,0,CeedOperatorFieldView +FNF:73 +FNH:25 +DA:36,1152 +DA:38,1152 +DA:42,1152 +DA:45,1152 +DA:47,1152 +DA:48,960 +DA:51,1152 +DA:53,1152 +DA:54,768 +DA:55,768 +DA:56,768 +DA:57,768 +DA:63,1152 +DA:64,384 +DA:65,384 +DA:68,384 +DA:69,576 +DA:73,576 +DA:76,576 +DA:77,192 +DA:79,192 +DA:81,1152 +DA:98,0 +DA:100,0 +DA:108,0 +DA:109,0 +DA:111,0 +DA:116,0 +DA:117,0 +DA:118,0 +DA:119,0 +DA:120,0 +DA:122,0 +DA:123,0 +DA:124,0 +DA:138,0 +DA:140,0 +DA:145,0 +DA:146,0 +DA:147,0 +DA:148,0 +DA:149,0 +DA:150,0 +DA:151,0 +DA:152,0 +DA:154,0 +DA:155,0 +DA:158,0 +DA:159,0 +DA:160,0 +DA:161,0 +DA:163,0 +DA:165,0 +DA:166,0 +DA:167,0 +DA:168,0 +DA:170,0 +DA:171,0 +DA:172,0 +DA:174,0 +DA:187,0 +DA:188,0 +DA:189,0 +DA:201,0 +DA:202,0 +DA:203,0 +DA:218,0 +DA:219,0 +DA:220,0 +DA:236,0 +DA:241,0 +DA:242,0 +DA:244,0 +DA:245,0 +DA:246,0 +DA:247,0 +DA:250,0 +DA:251,0 +DA:254,0 +DA:255,0 +DA:257,0 +DA:258,0 +DA:260,0 +DA:262,0 +DA:265,0 +DA:266,0 +DA:267,0 +DA:268,0 +DA:271,0 +DA:272,0 +DA:275,0 +DA:276,0 +DA:278,0 +DA:279,0 +DA:281,0 +DA:283,0 +DA:286,0 +DA:301,0 +DA:302,0 +DA:303,0 +DA:319,0 +DA:324,0 +DA:325,0 +DA:327,0 +DA:328,0 +DA:329,0 +DA:330,0 +DA:333,0 +DA:334,0 +DA:337,0 +DA:338,0 +DA:340,0 +DA:341,0 +DA:343,0 +DA:345,0 +DA:348,0 +DA:349,0 +DA:350,0 +DA:351,0 +DA:354,0 +DA:355,0 +DA:358,0 +DA:359,0 +DA:361,0 +DA:362,0 +DA:364,0 +DA:366,0 +DA:369,0 +DA:387,0 +DA:388,0 +DA:390,0 +DA:393,0 +DA:394,0 +DA:396,0 +DA:397,0 +DA:399,0 +DA:402,0 +DA:403,0 +DA:407,0 +DA:408,0 +DA:409,0 +DA:412,0 +DA:415,0 +DA:417,0 +DA:418,0 +DA:420,0 +DA:425,0 +DA:426,0 +DA:427,0 +DA:428,0 +DA:430,0 
+DA:431,0 +DA:450,0 +DA:452,0 +DA:454,0 +DA:456,0 +DA:457,0 +DA:460,0 +DA:461,0 +DA:463,0 +DA:464,0 +DA:466,0 +DA:469,0 +DA:470,0 +DA:474,0 +DA:475,0 +DA:476,0 +DA:479,0 +DA:482,0 +DA:484,0 +DA:485,0 +DA:486,0 +DA:487,0 +DA:489,0 +DA:494,0 +DA:495,0 +DA:496,0 +DA:497,0 +DA:499,0 +DA:517,0 +DA:518,0 +DA:520,0 +DA:523,0 +DA:524,0 +DA:526,0 +DA:527,0 +DA:529,0 +DA:532,0 +DA:533,0 +DA:537,0 +DA:538,0 +DA:539,0 +DA:542,0 +DA:545,0 +DA:547,0 +DA:548,0 +DA:549,0 +DA:550,0 +DA:552,0 +DA:557,0 +DA:558,0 +DA:559,0 +DA:560,0 +DA:562,0 +DA:583,0 +DA:586,0 +DA:587,0 +DA:588,0 +DA:589,0 +DA:604,0 +DA:608,0 +DA:609,0 +DA:610,0 +DA:614,0 +DA:615,0 +DA:616,0 +DA:617,0 +DA:619,0 +DA:621,0 +DA:625,0 +DA:626,0 +DA:627,0 +DA:628,0 +DA:630,0 +DA:632,0 +DA:645,1152 +DA:646,1152 +DA:647,1152 +DA:660,384 +DA:661,384 +DA:662,384 +DA:675,4224 +DA:678,4224 +DA:679,4224 +DA:680,4224 +DA:681,4224 +DA:682,4224 +DA:695,10368 +DA:696,10368 +DA:697,10368 +DA:710,1152 +DA:711,1152 +DA:712,1152 +DA:725,384 +DA:726,384 +DA:727,384 +DA:739,0 +DA:740,0 +DA:741,0 +DA:753,384 +DA:754,384 +DA:755,384 +DA:781,576 +DA:782,576 +DA:785,192 +DA:786,192 +DA:787,192 +DA:788,192 +DA:789,192 +DA:792,384 +DA:794,384 +DA:795,384 +DA:796,384 +DA:797,384 +DA:798,384 +DA:799,384 +DA:800,384 +DA:801,384 +DA:802,384 +DA:803,384 +DA:804,384 +DA:823,0 +DA:824,0 +DA:827,0 +DA:828,0 +DA:829,0 +DA:830,0 +DA:831,0 +DA:834,0 +DA:836,0 +DA:837,0 +DA:838,0 +DA:839,0 +DA:840,0 +DA:841,0 +DA:842,0 +DA:843,0 +DA:844,0 +DA:845,0 +DA:846,0 +DA:847,0 +DA:860,0 +DA:861,0 +DA:864,0 +DA:865,0 +DA:866,0 +DA:867,0 +DA:868,0 +DA:872,0 +DA:873,0 +DA:874,0 +DA:875,0 +DA:876,0 +DA:877,0 +DA:879,0 +DA:880,0 +DA:898,0 +DA:899,0 +DA:900,0 +DA:901,0 +DA:902,0 +DA:928,1152 +DA:929,1152 +DA:930,1152 +DA:935,1152 +DA:936,1152 +DA:937,1152 +DA:938,1152 +DA:939,1152 +DA:940,1152 +DA:941,1152 +DA:942,1152 +DA:944,1152 +DA:945,1152 +DA:950,1152 +DA:951,1152 +DA:952,0 +DA:954,0 +DA:956,0 +DA:957,0 +DA:961,0 +DA:962,0 +DA:968,1152 +DA:969,768 +DA:970,1152 
+DA:975,1152 +DA:976,1152 +DA:977,1152 +DA:978,2304 +DA:981,1920 +DA:982,1920 +DA:983,768 +DA:984,768 +DA:985,768 +DA:988,384 +DA:989,384 +DA:992,384 +DA:993,384 +DA:994,384 +DA:995,384 +DA:996,384 +DA:1002,1152 +DA:1003,1152 +DA:1004,1152 +DA:1006,1152 +DA:1009,768 +DA:1010,768 +DA:1011,384 +DA:1012,384 +DA:1015,384 +DA:1016,384 +DA:1021,1152 +DA:1022,1152 +DA:1023,1152 +DA:1024,384 +DA:1025,384 +DA:1027,1152 +DA:1028,1152 +DA:1029,1152 +DA:1030,1152 +DA:1031,1152 +DA:1049,1920 +DA:1054,1920 +DA:1055,1920 +DA:1056,1920 +DA:1058,1920 +DA:1059,1920 +DA:1060,1920 +DA:1061,1920 +DA:1062,1920 +DA:1063,1920 +DA:1079,0 +DA:1082,0 +DA:1083,0 +DA:1084,0 +DA:1085,0 +DA:1087,0 +DA:1088,0 +DA:1092,0 +DA:1093,0 +DA:1097,0 +DA:1098,0 +DA:1099,0 +DA:1112,1536 +DA:1113,1536 +DA:1114,1536 +DA:1130,0 +DA:1133,0 +DA:1134,0 +DA:1135,0 +DA:1137,0 +DA:1138,0 +DA:1139,0 +DA:1141,0 +DA:1142,0 +DA:1143,0 +DA:1145,0 +DA:1163,0 +DA:1168,0 +DA:1169,0 +DA:1170,0 +DA:1171,0 +DA:1172,0 +DA:1173,0 +DA:1174,0 +DA:1177,0 +DA:1178,0 +DA:1179,0 +DA:1180,0 +DA:1181,0 +DA:1184,0 +DA:1197,0 +DA:1198,0 +DA:1199,0 +DA:1214,10416 +DA:1215,10416 +DA:1216,10416 +DA:1217,10416 +DA:1232,5376 +DA:1233,5376 +DA:1234,5376 +DA:1235,5376 +DA:1250,7776 +DA:1251,7776 +DA:1252,7776 +DA:1253,7776 +DA:1273,0 +DA:1274,0 +DA:1275,0 +DA:1276,0 +DA:1277,0 +DA:1278,0 +DA:1291,0 +DA:1294,0 +DA:1295,0 +DA:1297,0 +DA:1298,0 +DA:1303,0 +DA:1304,0 +DA:1305,0 +DA:1307,0 +DA:1315,0 +DA:1316,0 +DA:1317,0 +DA:1318,0 +DA:1331,0 +DA:1334,0 +DA:1335,0 +DA:1336,0 +DA:1337,0 +DA:1350,0 +DA:1353,0 +DA:1354,0 +DA:1355,0 +DA:1356,0 +DA:1374,0 +DA:1379,0 +DA:1380,0 +DA:1381,0 +DA:1382,0 +DA:1383,0 +DA:1384,0 +DA:1385,0 +DA:1386,0 +DA:1387,0 +DA:1390,0 +DA:1402,3072 +DA:1404,3072 +DA:1406,3072 +DA:1408,384 +DA:1409,384 +DA:1410,384 +DA:1411,384 +DA:1414,0 +DA:1415,0 +DA:1417,0 +DA:1418,0 +DA:1422,0 +DA:1423,0 +DA:1424,0 +DA:1429,0 +DA:1434,384 +DA:1435,384 +DA:1436,384 +DA:1438,384 +DA:1439,384 +DA:1444,384 +DA:1445,384 +DA:1446,384 
+DA:1447,384 +DA:1448,384 +DA:1449,384 +DA:1465,0 +DA:1468,0 +DA:1469,0 +DA:1471,0 +DA:1472,0 +DA:1476,0 +DA:1477,0 +DA:1478,0 +DA:1481,0 +DA:1482,0 +DA:1483,0 +DA:1485,0 +DA:1493,0 +DA:1509,0 +DA:1512,0 +DA:1513,0 +DA:1514,0 +DA:1515,0 +DA:1520,0 +DA:1521,0 +DA:1523,0 +DA:1536,0 +DA:1539,0 +DA:1540,0 +DA:1544,0 +DA:1545,0 +DA:1546,0 +DA:1547,0 +DA:1552,0 +DA:1553,0 +DA:1555,0 +DA:1568,0 +DA:1570,0 +DA:1572,0 +DA:1573,0 +DA:1574,0 +DA:1575,0 +DA:1576,0 +DA:1578,0 +DA:1591,0 +DA:1592,0 +DA:1593,0 +DA:1594,0 +DA:1597,0 +DA:1598,0 +DA:1599,0 +DA:1601,0 +DA:1615,0 +DA:1617,0 +DA:1618,0 +DA:1619,0 +DA:1621,0 +DA:1622,0 +DA:1623,0 +DA:1624,0 +DA:1626,0 +DA:1627,0 +DA:1628,0 +DA:1629,0 +DA:1633,0 +DA:1634,0 +DA:1635,0 +DA:1636,0 +DA:1637,0 +DA:1638,0 +DA:1639,0 +DA:1640,0 +DA:1641,0 +DA:1642,0 +DA:1643,0 +DA:1646,0 +DA:1647,0 +DA:1648,0 +DA:1650,0 +DA:1651,0 +DA:1664,0 +DA:1665,0 +DA:1666,0 +DA:1679,0 +DA:1680,0 +DA:1681,0 +DA:1694,0 +DA:1695,0 +DA:1696,0 +DA:1709,0 +DA:1710,0 +DA:1711,0 +DA:1724,1536 +DA:1725,1536 +DA:1726,1536 +DA:1738,1152 +DA:1750,384 +DA:1753,384 +DA:1754,384 +DA:1755,384 +DA:1756,384 +DA:1769,768 +DA:1772,768 +DA:1773,768 +DA:1774,768 +DA:1775,768 +DA:1786,0 +DA:1789,0 +DA:1791,0 +DA:1792,0 +DA:1793,0 +DA:1796,0 +DA:1798,0 +DA:1801,0 +DA:1804,0 +DA:1805,0 +DA:1809,0 +DA:1814,0 +DA:1815,0 +DA:1816,0 +DA:1817,0 +DA:1819,0 +DA:1821,0 +DA:1822,0 +DA:1823,0 +DA:1825,0 +DA:1827,0 +DA:1828,0 +DA:1829,0 +DA:1831,0 +DA:1832,0 +DA:1834,0 +DA:1836,0 +DA:1838,0 +DA:1839,0 +DA:1840,0 +DA:1841,0 +DA:1844,0 +DA:1847,0 +DA:1848,0 +DA:1854,0 +DA:1855,0 +DA:1856,0 +DA:1857,0 +DA:1858,0 +DA:1859,0 +DA:1860,0 +DA:1861,0 +DA:1862,0 +DA:1864,0 +DA:1872,0 +DA:1873,0 +DA:1874,0 +DA:1875,0 +DA:1876,0 +DA:1877,0 +DA:1879,0 +DA:1883,0 +DA:1886,0 +DA:1887,0 +DA:1893,0 +DA:1894,0 +DA:1895,0 +DA:1896,0 +DA:1897,0 +DA:1898,0 +DA:1899,0 +DA:1900,0 +DA:1901,0 +DA:1903,0 +DA:1906,0 +DA:1924,0 +DA:1929,0 +DA:1930,0 +DA:1931,0 +DA:1932,0 +DA:1933,0 +DA:1934,0 +DA:1935,0 +DA:1936,0 
+DA:1952,0 +DA:1953,0 +DA:1955,0 +DA:1957,0 +DA:1960,0 +DA:1961,0 +DA:1962,0 +DA:1963,0 +DA:1972,0 +DA:1973,0 +DA:1974,0 +DA:1975,0 +DA:1976,0 +DA:1978,0 +DA:1979,0 +DA:1982,0 +DA:1983,0 +DA:1984,0 +DA:1985,0 +DA:1986,0 +DA:1987,0 +DA:1988,0 +DA:1995,0 +DA:1997,0 +DA:2005,0 +DA:2011,0 +DA:2012,0 +DA:2025,0 +DA:2026,0 +DA:2027,0 +DA:2028,0 +DA:2029,0 +DA:2031,0 +DA:2036,0 +DA:2037,0 +DA:2040,0 +DA:2041,0 +DA:2042,0 +DA:2043,0 +DA:2044,0 +DA:2045,0 +DA:2047,0 +DA:2048,0 +DA:2050,0 +DA:2066,0 +DA:2067,0 +DA:2084,0 +DA:2085,0 +DA:2099,0 +DA:2100,0 +DA:2116,0 +DA:2117,0 +DA:2134,0 +DA:2135,0 +DA:2149,0 +DA:2150,0 +DA:2166,0 +DA:2167,0 +DA:2184,0 +DA:2185,0 +DA:2199,0 +DA:2200,0 +DA:2220,384 +DA:2223,384 +DA:2225,384 +DA:2226,384 +DA:2228,0 +DA:2229,384 +DA:2231,0 +DA:2235,384 +DA:2238,384 +DA:2240,384 +DA:2262,384 +DA:2265,384 +DA:2267,384 +DA:2268,384 +DA:2270,0 +DA:2271,0 +DA:2276,0 +DA:2277,0 +DA:2278,0 +DA:2279,0 +DA:2282,384 +DA:2284,384 +DA:2286,384 +DA:2306,384 +DA:2309,384 +DA:2311,384 +DA:2312,384 +DA:2317,0 +DA:2318,0 +DA:2321,0 +DA:2325,0 +DA:2326,0 +DA:2329,0 +DA:2330,0 +DA:2331,0 +DA:2335,0 +DA:2341,384 +DA:2343,768 +DA:2346,384 +DA:2347,384 +DA:2348,384 +DA:2351,384 +DA:2353,384 +DA:2365,384 +DA:2368,384 +DA:2369,384 +DA:2370,384 +DA:2371,384 +DA:2375,0 +DA:2376,0 +DA:2377,0 +DA:2378,0 +DA:2379,0 +DA:2382,384 +DA:2394,768 +DA:2395,768 +DA:2396,384 +DA:2397,384 +DA:2400,384 +DA:2401,384 +DA:2404,1536 +DA:2405,1152 +DA:2406,768 +DA:2407,576 +DA:2409,768 +DA:2410,576 +DA:2412,768 +DA:2413,192 +DA:2415,768 +DA:2416,768 +DA:2419,1536 +DA:2420,1152 +DA:2421,384 +DA:2422,384 +DA:2423,192 +DA:2425,384 +DA:2426,0 +DA:2428,384 +DA:2429,384 +DA:2432,384 +DA:2433,384 +DA:2435,384 +DA:2436,384 +DA:2437,384 +DA:2439,384 +DA:2441,384 +DA:2442,0 +DA:2443,0 +DA:2446,384 +DA:2447,384 +DA:2448,384 +DA:2449,384 +DA:2451,384 +DA:2452,0 +DA:2453,0 +DA:2454,0 +DA:2457,384 +DA:2460,384 +DA:2462,384 +DA:2463,384 +DA:2464,384 +DA:2465,384 +LF:841 +LH:250 +end_of_record +TN: 
+SF:/home/jeremy/Dev/libCEED/interface/ceed-preconditioning.c +FNL:0,1011,1215 +FNA:0,0,CeedOperatorMultigridLevelCreateSingle_Core +FNL:1,105,202 +FNA:1,0,CeedOperatorCreateFallback +FNL:2,1234,1255 +FNA:2,0,CeedBuildMassLaplace +FNL:3,1276,1298 +FNA:3,0,CeedOperatorGetBasisPointer +FNL:4,1310,1340 +FNA:4,0,CeedOperatorCreateActivePointBlockRestriction +FNL:5,1352,1361 +FNA:5,0,CeedOperatorGetQFunctionAssemblyData +FNL:6,1373,1378 +FNA:6,0,CeedQFunctionAssemblyDataCreate +FNL:7,1389,1392 +FNA:7,0,CeedQFunctionAssemblyDataReference +FNL:8,1404,1408 +FNA:8,0,CeedQFunctionAssemblyDataSetReuse +FNL:9,1420,1423 +FNA:9,0,CeedQFunctionAssemblyDataSetUpdateNeeded +FNL:10,1435,1438 +FNA:10,0,CeedQFunctionAssemblyDataIsUpdateNeeded +FNL:11,1455,1460 +FNA:11,0,CeedQFunctionAssemblyDataReferenceCopy +FNL:12,1472,1475 +FNA:12,0,CeedQFunctionAssemblyDataIsSetup +FNL:13,1488,1494 +FNA:13,0,CeedQFunctionAssemblyDataSetObjects +FNL:14,1507,1513 +FNA:14,0,CeedQFunctionAssemblyDataGetObjects +FNL:15,1524,1535 +FNA:15,384,CeedQFunctionAssemblyDataDestroy +FNL:16,1547,1556 +FNA:16,0,CeedOperatorGetOperatorAssemblyData +FNL:17,1576,1721 +FNA:17,0,CeedOperatorAssemblyDataCreate +FNL:18,1744,1758 +FNA:18,0,CeedOperatorAssemblyDataGetEvalModes +FNL:19,1777,1885 +FNA:19,0,CeedOperatorAssemblyDataGetBases +FNL:20,1902,1910 +FNA:20,0,CeedOperatorAssemblyDataGetElemRestrictions +FNL:21,1921,1956 +FNA:21,384,CeedOperatorAssemblyDataDestroy +FNL:22,1968,1995 +FNA:22,0,CeedOperatorGetFallback +FNL:23,2007,2010 +FNA:23,0,CeedOperatorGetFallbackParent +FNL:24,2022,2027 +FNA:24,0,CeedOperatorGetFallbackParentCeed +FNL:25,2057,2073 +FNA:25,0,CeedOperatorLinearAssembleQFunction +FNL:26,2094,2096 +FNA:26,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdate +FNL:27,2115,2162 +FNA:27,0,CeedOperatorLinearAssembleDiagonal +FNL:28,216,387 +FNA:28,0,CeedOperatorLinearAssembleAddDiagonalSingle_Mesh +FNL:29,2181,2221 +FNA:29,0,CeedOperatorLinearAssembleAddDiagonal +FNL:30,2243,2325 
+FNA:30,0,CeedOperatorLinearAssemblePointBlockDiagonalSymbolic +FNL:31,2346,2388 +FNA:31,0,CeedOperatorLinearAssemblePointBlockDiagonal +FNL:32,2409,2449 +FNA:32,0,CeedOperatorLinearAssembleAddPointBlockDiagonal +FNL:33,2471,2516 +FNA:33,0,CeedOperatorLinearAssembleSymbolic +FNL:34,2536,2588 +FNA:34,0,CeedOperatorLinearAssemble +FNL:35,2604,2662 +FNA:35,0,CeedOperatorCompositeGetMultiplicity +FNL:36,2681,2700 +FNA:36,0,CeedOperatorMultigridLevelCreate +FNL:37,2720,2767 +FNA:37,0,CeedOperatorMultigridLevelCreateTensorH1 +FNL:38,2787,2832 +FNA:38,0,CeedOperatorMultigridLevelCreateH1 +FNL:39,2853,3076 +FNA:39,0,CeedOperatorCreateFDMElementInverse +FNL:40,37,94 +FNA:40,0,CeedQFunctionCreateFallback +FNL:41,401,409 +FNA:41,0,CeedOperatorLinearAssembleAddDiagonalSingle +FNL:42,423,438 +FNA:42,0,CeedOperatorLinearAssembleAddDiagonalComposite +FNL:43,454,552 +FNA:43,0,CeedOperatorAssembleSymbolicSingle +FNL:44,574,632 +FNA:44,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core +FNL:45,653,656 +FNA:45,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback +FNL:46,671,916 +FNA:46,0,CeedOperatorAssembleSingle +FNL:47,928,957 +FNA:47,0,CeedOperatorAssemblyCountEntriesSingle +FNL:48,969,993 +FNA:48,0,CeedOperatorLinearAssembleGetNumEntries +FNF:49 +FNH:2 +DA:37,0 +DA:38,0 +DA:43,0 +DA:45,0 +DA:47,0 +DA:48,0 +DA:50,0 +DA:51,0 +DA:52,0 +DA:53,0 +DA:54,0 +DA:55,0 +DA:57,0 +DA:64,0 +DA:65,0 +DA:66,0 +DA:71,0 +DA:72,0 +DA:73,0 +DA:75,0 +DA:76,0 +DA:81,0 +DA:82,0 +DA:84,0 +DA:89,0 +DA:90,0 +DA:92,0 +DA:93,0 +DA:105,0 +DA:111,0 +DA:114,0 +DA:115,0 +DA:116,0 +DA:117,0 +DA:119,0 +DA:122,0 +DA:123,0 +DA:127,0 +DA:128,0 +DA:129,0 +DA:130,0 +DA:133,0 +DA:134,0 +DA:137,0 +DA:139,0 +DA:142,0 +DA:143,0 +DA:144,0 +DA:145,0 +DA:146,0 +DA:150,0 +DA:151,0 +DA:152,0 +DA:153,0 +DA:154,0 +DA:156,0 +DA:158,0 +DA:159,0 +DA:165,0 +DA:166,0 +DA:167,0 +DA:168,0 +DA:169,0 +DA:171,0 +DA:177,0 +DA:178,0 +DA:179,0 +DA:180,0 +DA:181,0 +DA:186,0 +DA:187,0 +DA:190,0 +DA:191,0 +DA:192,0 +DA:194,0 
+DA:195,0 +DA:198,0 +DA:199,0 +DA:200,0 +DA:201,0 +DA:216,0 +DA:220,0 +DA:221,0 +DA:226,0 +DA:227,0 +DA:229,0 +DA:230,0 +DA:231,0 +DA:232,0 +DA:242,0 +DA:243,0 +DA:246,0 +DA:247,0 +DA:250,0 +DA:252,0 +DA:253,0 +DA:257,0 +DA:258,0 +DA:259,0 +DA:260,0 +DA:261,0 +DA:264,0 +DA:265,0 +DA:268,0 +DA:269,0 +DA:270,0 +DA:271,0 +DA:274,0 +DA:275,0 +DA:278,0 +DA:282,0 +DA:283,0 +DA:285,0 +DA:289,0 +DA:292,0 +DA:293,0 +DA:294,0 +DA:295,0 +DA:296,0 +DA:297,0 +DA:298,0 +DA:301,0 +DA:302,0 +DA:304,0 +DA:305,0 +DA:307,0 +DA:308,0 +DA:309,0 +DA:314,0 +DA:316,0 +DA:317,0 +DA:319,0 +DA:320,0 +DA:321,0 +DA:322,0 +DA:324,0 +DA:325,0 +DA:326,0 +DA:327,0 +DA:328,0 +DA:330,0 +DA:332,0 +DA:333,0 +DA:335,0 +DA:336,0 +DA:337,0 +DA:338,0 +DA:339,0 +DA:341,0 +DA:344,0 +DA:346,0 +DA:347,0 +DA:349,0 +DA:350,0 +DA:351,0 +DA:352,0 +DA:354,0 +DA:355,0 +DA:356,0 +DA:361,0 +DA:362,0 +DA:363,0 +DA:365,0 +DA:366,0 +DA:374,0 +DA:377,0 +DA:380,0 +DA:381,0 +DA:382,0 +DA:384,0 +DA:385,0 +DA:386,0 +DA:401,0 +DA:405,0 +DA:406,0 +DA:407,0 +DA:408,0 +DA:423,0 +DA:428,0 +DA:429,0 +DA:430,0 +DA:431,0 +DA:432,0 +DA:434,0 +DA:437,0 +DA:454,0 +DA:457,0 +DA:465,0 +DA:466,0 +DA:467,0 +DA:469,0 +DA:470,0 +DA:471,0 +DA:472,0 +DA:473,0 +DA:474,0 +DA:477,0 +DA:478,0 +DA:479,0 +DA:480,0 +DA:481,0 +DA:482,0 +DA:483,0 +DA:484,0 +DA:485,0 +DA:486,0 +DA:487,0 +DA:489,0 +DA:490,0 +DA:491,0 +DA:495,0 +DA:496,0 +DA:497,0 +DA:500,0 +DA:501,0 +DA:502,0 +DA:503,0 +DA:504,0 +DA:505,0 +DA:506,0 +DA:507,0 +DA:508,0 +DA:509,0 +DA:510,0 +DA:512,0 +DA:513,0 +DA:514,0 +DA:515,0 +DA:516,0 +DA:517,0 +DA:518,0 +DA:520,0 +DA:523,0 +DA:524,0 +DA:525,0 +DA:526,0 +DA:527,0 +DA:528,0 +DA:529,0 +DA:530,0 +DA:531,0 +DA:533,0 +DA:534,0 +DA:535,0 +DA:541,0 +DA:542,0 +DA:543,0 +DA:544,0 +DA:545,0 +DA:546,0 +DA:548,0 +DA:549,0 +DA:550,0 +DA:551,0 +DA:574,0 +DA:576,0 +DA:577,0 +DA:578,0 +DA:580,0 +DA:583,0 +DA:584,0 +DA:586,0 +DA:587,0 +DA:588,0 +DA:589,0 +DA:591,0 +DA:592,0 +DA:596,0 +DA:600,0 +DA:601,0 +DA:603,0 +DA:604,0 +DA:605,0 +DA:608,0 +DA:609,0 
+DA:610,0 +DA:612,0 +DA:613,0 +DA:615,0 +DA:618,0 +DA:619,0 +DA:620,0 +DA:621,0 +DA:626,0 +DA:627,0 +DA:628,0 +DA:629,0 +DA:631,0 +DA:653,0 +DA:655,0 +DA:671,0 +DA:674,0 +DA:675,0 +DA:679,0 +DA:681,0 +DA:682,0 +DA:685,0 +DA:687,0 +DA:688,0 +DA:693,0 +DA:694,0 +DA:695,0 +DA:696,0 +DA:697,0 +DA:701,0 +DA:702,0 +DA:708,0 +DA:709,0 +DA:711,0 +DA:712,0 +DA:713,0 +DA:714,0 +DA:719,0 +DA:726,0 +DA:727,0 +DA:730,0 +DA:731,0 +DA:734,0 +DA:736,0 +DA:739,0 +DA:740,0 +DA:741,0 +DA:742,0 +DA:743,0 +DA:744,0 +DA:746,0 +DA:747,0 +DA:748,0 +DA:749,0 +DA:750,0 +DA:752,0 +DA:753,0 +DA:754,0 +DA:755,0 +DA:756,0 +DA:759,0 +DA:760,0 +DA:761,0 +DA:765,0 +DA:766,0 +DA:767,0 +DA:768,0 +DA:769,0 +DA:774,0 +DA:775,0 +DA:776,0 +DA:777,0 +DA:778,0 +DA:781,0 +DA:782,0 +DA:783,0 +DA:784,0 +DA:786,0 +DA:787,0 +DA:789,0 +DA:794,0 +DA:796,0 +DA:797,0 +DA:798,0 +DA:799,0 +DA:801,0 +DA:802,0 +DA:803,0 +DA:804,0 +DA:806,0 +DA:807,0 +DA:808,0 +DA:809,0 +DA:810,0 +DA:812,0 +DA:813,0 +DA:814,0 +DA:815,0 +DA:817,0 +DA:819,0 +DA:825,0 +DA:826,0 +DA:831,0 +DA:832,0 +DA:833,0 +DA:837,0 +DA:838,0 +DA:840,0 +DA:841,0 +DA:843,0 +DA:844,0 +DA:847,0 +DA:848,0 +DA:851,0 +DA:852,0 +DA:853,0 +DA:854,0 +DA:855,0 +DA:856,0 +DA:860,0 +DA:861,0 +DA:863,0 +DA:864,0 +DA:865,0 +DA:868,0 +DA:869,0 +DA:872,0 +DA:873,0 +DA:874,0 +DA:875,0 +DA:876,0 +DA:877,0 +DA:883,0 +DA:884,0 +DA:885,0 +DA:886,0 +DA:892,0 +DA:893,0 +DA:896,0 +DA:897,0 +DA:898,0 +DA:899,0 +DA:900,0 +DA:901,0 +DA:902,0 +DA:904,0 +DA:905,0 +DA:906,0 +DA:907,0 +DA:908,0 +DA:911,0 +DA:912,0 +DA:913,0 +DA:914,0 +DA:915,0 +DA:928,0 +DA:933,0 +DA:934,0 +DA:936,0 +DA:937,0 +DA:938,0 +DA:939,0 +DA:940,0 +DA:941,0 +DA:942,0 +DA:946,0 +DA:947,0 +DA:949,0 +DA:950,0 +DA:951,0 +DA:953,0 +DA:954,0 +DA:955,0 +DA:956,0 +DA:969,0 +DA:972,0 +DA:973,0 +DA:975,0 +DA:979,0 +DA:980,0 +DA:982,0 +DA:983,0 +DA:986,0 +DA:987,0 +DA:990,0 +DA:992,0 +DA:1011,0 +DA:1017,0 +DA:1018,0 +DA:1021,0 +DA:1024,0 +DA:1025,0 +DA:1031,0 +DA:1032,0 +DA:1036,0 +DA:1037,0 +DA:1038,0 +DA:1039,0 
+DA:1040,0 +DA:1042,0 +DA:1045,0 +DA:1047,0 +DA:1050,0 +DA:1051,0 +DA:1053,0 +DA:1054,0 +DA:1055,0 +DA:1056,0 +DA:1057,0 +DA:1058,0 +DA:1060,0 +DA:1061,0 +DA:1063,0 +DA:1064,0 +DA:1065,0 +DA:1066,0 +DA:1069,0 +DA:1072,0 +DA:1073,0 +DA:1075,0 +DA:1076,0 +DA:1077,0 +DA:1078,0 +DA:1079,0 +DA:1080,0 +DA:1082,0 +DA:1083,0 +DA:1085,0 +DA:1086,0 +DA:1087,0 +DA:1088,0 +DA:1094,0 +DA:1095,0 +DA:1099,0 +DA:1103,0 +DA:1104,0 +DA:1106,0 +DA:1107,0 +DA:1108,0 +DA:1109,0 +DA:1110,0 +DA:1111,0 +DA:1112,0 +DA:1113,0 +DA:1114,0 +DA:1118,0 +DA:1119,0 +DA:1120,0 +DA:1123,0 +DA:1127,0 +DA:1130,0 +DA:1135,0 +DA:1136,0 +DA:1137,0 +DA:1138,0 +DA:1139,0 +DA:1140,0 +DA:1141,0 +DA:1142,0 +DA:1143,0 +DA:1144,0 +DA:1145,0 +DA:1147,0 +DA:1148,0 +DA:1149,0 +DA:1150,0 +DA:1155,0 +DA:1156,0 +DA:1157,0 +DA:1158,0 +DA:1161,0 +DA:1164,0 +DA:1168,0 +DA:1173,0 +DA:1174,0 +DA:1175,0 +DA:1176,0 +DA:1177,0 +DA:1178,0 +DA:1179,0 +DA:1180,0 +DA:1181,0 +DA:1182,0 +DA:1183,0 +DA:1185,0 +DA:1186,0 +DA:1187,0 +DA:1188,0 +DA:1193,0 +DA:1194,0 +DA:1195,0 +DA:1196,0 +DA:1199,0 +DA:1202,0 +DA:1206,0 +DA:1209,0 +DA:1210,0 +DA:1211,0 +DA:1212,0 +DA:1213,0 +DA:1214,0 +DA:1234,0 +DA:1236,0 +DA:1237,0 +DA:1238,0 +DA:1239,0 +DA:1240,0 +DA:1244,0 +DA:1245,0 +DA:1246,0 +DA:1248,0 +DA:1249,0 +DA:1252,0 +DA:1253,0 +DA:1254,0 +DA:1276,0 +DA:1277,0 +DA:1278,0 +DA:1279,0 +DA:1280,0 +DA:1281,0 +DA:1282,0 +DA:1283,0 +DA:1284,0 +DA:1285,0 +DA:1286,0 +DA:1287,0 +DA:1288,0 +DA:1289,0 +DA:1290,0 +DA:1291,0 +DA:1292,0 +DA:1293,0 +DA:1294,0 +DA:1296,0 +DA:1297,0 +DA:1310,0 +DA:1316,0 +DA:1317,0 +DA:1320,0 +DA:1321,0 +DA:1322,0 +DA:1323,0 +DA:1324,0 +DA:1325,0 +DA:1326,0 +DA:1327,0 +DA:1328,0 +DA:1329,0 +DA:1333,0 +DA:1337,0 +DA:1338,0 +DA:1339,0 +DA:1352,0 +DA:1353,0 +DA:1356,0 +DA:1357,0 +DA:1359,0 +DA:1360,0 +DA:1373,0 +DA:1374,0 +DA:1375,0 +DA:1376,0 +DA:1377,0 +DA:1389,0 +DA:1390,0 +DA:1391,0 +DA:1404,0 +DA:1405,0 +DA:1406,0 +DA:1407,0 +DA:1420,0 +DA:1421,0 +DA:1422,0 +DA:1435,0 +DA:1436,0 +DA:1437,0 +DA:1455,0 +DA:1456,0 
+DA:1457,0 +DA:1458,0 +DA:1459,0 +DA:1472,0 +DA:1473,0 +DA:1474,0 +DA:1488,0 +DA:1489,0 +DA:1490,0 +DA:1492,0 +DA:1493,0 +DA:1507,0 +DA:1508,0 +DA:1510,0 +DA:1511,0 +DA:1512,0 +DA:1524,384 +DA:1525,384 +DA:1526,384 +DA:1527,384 +DA:1529,0 +DA:1530,0 +DA:1531,0 +DA:1533,0 +DA:1534,0 +DA:1547,0 +DA:1548,0 +DA:1551,0 +DA:1552,0 +DA:1554,0 +DA:1555,0 +DA:1576,0 +DA:1577,0 +DA:1578,0 +DA:1579,0 +DA:1580,0 +DA:1586,0 +DA:1587,0 +DA:1590,0 +DA:1591,0 +DA:1594,0 +DA:1597,0 +DA:1598,0 +DA:1599,0 +DA:1602,0 +DA:1603,0 +DA:1604,0 +DA:1606,0 +DA:1608,0 +DA:1609,0 +DA:1610,0 +DA:1611,0 +DA:1612,0 +DA:1613,0 +DA:1615,0 +DA:1618,0 +DA:1619,0 +DA:1620,0 +DA:1621,0 +DA:1622,0 +DA:1623,0 +DA:1624,0 +DA:1625,0 +DA:1626,0 +DA:1627,0 +DA:1628,0 +DA:1629,0 +DA:1630,0 +DA:1631,0 +DA:1632,0 +DA:1633,0 +DA:1634,0 +DA:1635,0 +DA:1637,0 +DA:1639,0 +DA:1640,0 +DA:1641,0 +DA:1642,0 +DA:1643,0 +DA:1644,0 +DA:1646,0 +DA:1648,0 +DA:1650,0 +DA:1654,0 +DA:1655,0 +DA:1656,0 +DA:1657,0 +DA:1660,0 +DA:1661,0 +DA:1662,0 +DA:1664,0 +DA:1666,0 +DA:1667,0 +DA:1668,0 +DA:1669,0 +DA:1670,0 +DA:1671,0 +DA:1673,0 +DA:1676,0 +DA:1677,0 +DA:1678,0 +DA:1679,0 +DA:1680,0 +DA:1681,0 +DA:1682,0 +DA:1683,0 +DA:1684,0 +DA:1685,0 +DA:1686,0 +DA:1687,0 +DA:1688,0 +DA:1689,0 +DA:1690,0 +DA:1691,0 +DA:1692,0 +DA:1693,0 +DA:1695,0 +DA:1697,0 +DA:1698,0 +DA:1699,0 +DA:1700,0 +DA:1701,0 +DA:1702,0 +DA:1704,0 +DA:1706,0 +DA:1708,0 +DA:1710,0 +DA:1711,0 +DA:1712,0 +DA:1713,0 +DA:1714,0 +DA:1715,0 +DA:1716,0 +DA:1717,0 +DA:1718,0 +DA:1719,0 +DA:1720,0 +DA:1744,0 +DA:1748,0 +DA:1749,0 +DA:1750,0 +DA:1751,0 +DA:1752,0 +DA:1753,0 +DA:1754,0 +DA:1755,0 +DA:1756,0 +DA:1757,0 +DA:1777,0 +DA:1781,0 +DA:1784,0 +DA:1785,0 +DA:1786,0 +DA:1787,0 +DA:1789,0 +DA:1791,0 +DA:1792,0 +DA:1794,0 +DA:1795,0 +DA:1797,0 +DA:1798,0 +DA:1799,0 +DA:1800,0 +DA:1804,0 +DA:1805,0 +DA:1806,0 +DA:1807,0 +DA:1809,0 +DA:1810,0 +DA:1811,0 +DA:1813,0 +DA:1814,0 +DA:1815,0 +DA:1816,0 +DA:1817,0 +DA:1819,0 +DA:1820,0 +DA:1824,0 +DA:1825,0 +DA:1829,0 +DA:1832,0 
+DA:1833,0 +DA:1834,0 +DA:1835,0 +DA:1837,0 +DA:1839,0 +DA:1840,0 +DA:1842,0 +DA:1843,0 +DA:1845,0 +DA:1846,0 +DA:1847,0 +DA:1848,0 +DA:1852,0 +DA:1853,0 +DA:1854,0 +DA:1855,0 +DA:1857,0 +DA:1858,0 +DA:1859,0 +DA:1861,0 +DA:1862,0 +DA:1863,0 +DA:1864,0 +DA:1865,0 +DA:1867,0 +DA:1868,0 +DA:1872,0 +DA:1873,0 +DA:1878,0 +DA:1879,0 +DA:1880,0 +DA:1881,0 +DA:1882,0 +DA:1883,0 +DA:1884,0 +DA:1902,0 +DA:1905,0 +DA:1906,0 +DA:1907,0 +DA:1908,0 +DA:1909,0 +DA:1921,384 +DA:1922,384 +DA:1923,384 +DA:1924,384 +DA:1926,0 +DA:1927,0 +DA:1928,0 +DA:1929,0 +DA:1930,0 +DA:1931,0 +DA:1932,0 +DA:1934,0 +DA:1935,0 +DA:1936,0 +DA:1937,0 +DA:1938,0 +DA:1939,0 +DA:1941,0 +DA:1942,0 +DA:1943,0 +DA:1944,0 +DA:1945,0 +DA:1946,0 +DA:1947,0 +DA:1948,0 +DA:1949,0 +DA:1950,0 +DA:1951,0 +DA:1952,0 +DA:1954,0 +DA:1955,0 +DA:1968,0 +DA:1970,0 +DA:1971,0 +DA:1975,0 +DA:1976,0 +DA:1977,0 +DA:1981,0 +DA:1982,0 +DA:1983,0 +DA:1984,0 +DA:1986,0 +DA:1987,0 +DA:1989,0 +DA:1991,0 +DA:1993,0 +DA:1994,0 +DA:2007,0 +DA:2008,0 +DA:2009,0 +DA:2022,0 +DA:2023,0 +DA:2024,0 +DA:2025,0 +DA:2026,0 +DA:2057,0 +DA:2058,0 +DA:2060,0 +DA:2062,0 +DA:2067,0 +DA:2068,0 +DA:2069,0 +DA:2070,0 +DA:2072,0 +DA:2094,0 +DA:2095,0 +DA:2115,0 +DA:2117,0 +DA:2119,0 +DA:2120,0 +DA:2122,0 +DA:2123,0 +DA:2126,0 +DA:2127,0 +DA:2129,0 +DA:2130,0 +DA:2133,0 +DA:2135,0 +DA:2136,0 +DA:2137,0 +DA:2139,0 +DA:2140,0 +DA:2141,0 +DA:2142,0 +DA:2144,0 +DA:2145,0 +DA:2146,0 +DA:2151,0 +DA:2152,0 +DA:2153,0 +DA:2154,0 +DA:2155,0 +DA:2159,0 +DA:2160,0 +DA:2161,0 +DA:2181,0 +DA:2183,0 +DA:2185,0 +DA:2186,0 +DA:2188,0 +DA:2189,0 +DA:2192,0 +DA:2193,0 +DA:2195,0 +DA:2196,0 +DA:2199,0 +DA:2201,0 +DA:2202,0 +DA:2203,0 +DA:2205,0 +DA:2206,0 +DA:2211,0 +DA:2212,0 +DA:2213,0 +DA:2214,0 +DA:2215,0 +DA:2219,0 +DA:2220,0 +DA:2243,0 +DA:2248,0 +DA:2250,0 +DA:2251,0 +DA:2252,0 +DA:2254,0 +DA:2255,0 +DA:2256,0 +DA:2258,0 +DA:2259,0 +DA:2269,0 +DA:2270,0 +DA:2271,0 +DA:2272,0 +DA:2275,0 +DA:2276,0 +DA:2277,0 +DA:2278,0 +DA:2281,0 +DA:2282,0 +DA:2284,0 +DA:2285,0 
+DA:2292,0 +DA:2293,0 +DA:2294,0 +DA:2296,0 +DA:2301,0 +DA:2302,0 +DA:2303,0 +DA:2304,0 +DA:2305,0 +DA:2307,0 +DA:2308,0 +DA:2310,0 +DA:2311,0 +DA:2312,0 +DA:2313,0 +DA:2314,0 +DA:2319,0 +DA:2320,0 +DA:2321,0 +DA:2322,0 +DA:2324,0 +DA:2346,0 +DA:2348,0 +DA:2350,0 +DA:2351,0 +DA:2353,0 +DA:2354,0 +DA:2357,0 +DA:2358,0 +DA:2360,0 +DA:2361,0 +DA:2364,0 +DA:2366,0 +DA:2367,0 +DA:2368,0 +DA:2370,0 +DA:2371,0 +DA:2372,0 +DA:2377,0 +DA:2378,0 +DA:2379,0 +DA:2380,0 +DA:2381,0 +DA:2385,0 +DA:2386,0 +DA:2387,0 +DA:2409,0 +DA:2411,0 +DA:2413,0 +DA:2414,0 +DA:2416,0 +DA:2417,0 +DA:2420,0 +DA:2421,0 +DA:2423,0 +DA:2424,0 +DA:2427,0 +DA:2429,0 +DA:2430,0 +DA:2435,0 +DA:2436,0 +DA:2437,0 +DA:2438,0 +DA:2439,0 +DA:2443,0 +DA:2444,0 +DA:2446,0 +DA:2448,0 +DA:2471,0 +DA:2473,0 +DA:2477,0 +DA:2478,0 +DA:2480,0 +DA:2482,0 +DA:2483,0 +DA:2488,0 +DA:2489,0 +DA:2490,0 +DA:2491,0 +DA:2492,0 +DA:2499,0 +DA:2500,0 +DA:2501,0 +DA:2504,0 +DA:2505,0 +DA:2506,0 +DA:2507,0 +DA:2508,0 +DA:2509,0 +DA:2510,0 +DA:2513,0 +DA:2515,0 +DA:2536,0 +DA:2538,0 +DA:2539,0 +DA:2542,0 +DA:2543,0 +DA:2546,0 +DA:2547,0 +DA:2549,0 +DA:2550,0 +DA:2553,0 +DA:2555,0 +DA:2556,0 +DA:2557,0 +DA:2559,0 +DA:2560,0 +DA:2561,0 +DA:2562,0 +DA:2563,0 +DA:2564,0 +DA:2565,0 +DA:2567,0 +DA:2568,0 +DA:2569,0 +DA:2570,0 +DA:2571,0 +DA:2576,0 +DA:2577,0 +DA:2578,0 +DA:2579,0 +DA:2580,0 +DA:2585,0 +DA:2586,0 +DA:2587,0 +DA:2604,0 +DA:2613,0 +DA:2616,0 +DA:2619,0 +DA:2620,0 +DA:2621,0 +DA:2624,0 +DA:2625,0 +DA:2626,0 +DA:2627,0 +DA:2628,0 +DA:2629,0 +DA:2632,0 +DA:2637,0 +DA:2638,0 +DA:2642,0 +DA:2643,0 +DA:2644,0 +DA:2645,0 +DA:2646,0 +DA:2647,0 +DA:2648,0 +DA:2649,0 +DA:2651,0 +DA:2652,0 +DA:2654,0 +DA:2655,0 +DA:2656,0 +DA:2657,0 +DA:2659,0 +DA:2660,0 +DA:2661,0 +DA:2681,0 +DA:2683,0 +DA:2685,0 +DA:2688,0 +DA:2691,0 +DA:2692,0 +DA:2693,0 +DA:2697,0 +DA:2699,0 +DA:2720,0 +DA:2725,0 +DA:2727,0 +DA:2728,0 +DA:2731,0 +DA:2732,0 +DA:2733,0 +DA:2734,0 +DA:2740,0 +DA:2745,0 +DA:2747,0 +DA:2748,0 +DA:2749,0 +DA:2750,0 +DA:2751,0 
+DA:2752,0 +DA:2753,0 +DA:2754,0 +DA:2755,0 +DA:2756,0 +DA:2757,0 +DA:2758,0 +DA:2759,0 +DA:2763,0 +DA:2765,0 +DA:2766,0 +DA:2787,0 +DA:2792,0 +DA:2794,0 +DA:2795,0 +DA:2798,0 +DA:2799,0 +DA:2800,0 +DA:2801,0 +DA:2804,0 +DA:2810,0 +DA:2812,0 +DA:2813,0 +DA:2814,0 +DA:2815,0 +DA:2816,0 +DA:2817,0 +DA:2818,0 +DA:2819,0 +DA:2820,0 +DA:2821,0 +DA:2822,0 +DA:2823,0 +DA:2824,0 +DA:2828,0 +DA:2830,0 +DA:2831,0 +DA:2853,0 +DA:2855,0 +DA:2856,0 +DA:2860,0 +DA:2861,0 +DA:2867,0 +DA:2869,0 +DA:2871,0 +DA:2872,0 +DA:2877,0 +DA:2878,0 +DA:2879,0 +DA:2880,0 +DA:2881,0 +DA:2886,0 +DA:2887,0 +DA:2888,0 +DA:2891,0 +DA:2892,0 +DA:2893,0 +DA:2896,0 +DA:2897,0 +DA:2900,0 +DA:2901,0 +DA:2902,0 +DA:2903,0 +DA:2904,0 +DA:2906,0 +DA:2908,0 +DA:2909,0 +DA:2910,0 +DA:2911,0 +DA:2912,0 +DA:2913,0 +DA:2914,0 +DA:2915,0 +DA:2918,0 +DA:2919,0 +DA:2920,0 +DA:2921,0 +DA:2922,0 +DA:2923,0 +DA:2924,0 +DA:2926,0 +DA:2927,0 +DA:2928,0 +DA:2929,0 +DA:2932,0 +DA:2933,0 +DA:2934,0 +DA:2935,0 +DA:2936,0 +DA:2938,0 +DA:2941,0 +DA:2942,0 +DA:2944,0 +DA:2945,0 +DA:2948,0 +DA:2949,0 +DA:2950,0 +DA:2951,0 +DA:2954,0 +DA:2955,0 +DA:2956,0 +DA:2957,0 +DA:2958,0 +DA:2959,0 +DA:2961,0 +DA:2962,0 +DA:2964,0 +DA:2965,0 +DA:2966,0 +DA:2967,0 +DA:2968,0 +DA:2972,0 +DA:2973,0 +DA:2975,0 +DA:2978,0 +DA:2979,0 +DA:2980,0 +DA:2981,0 +DA:2988,0 +DA:2989,0 +DA:2990,0 +DA:2991,0 +DA:2992,0 +DA:2993,0 +DA:2994,0 +DA:2995,0 +DA:2996,0 +DA:2999,0 +DA:3002,0 +DA:3003,0 +DA:3004,0 +DA:3005,0 +DA:3006,0 +DA:3007,0 +DA:3008,0 +DA:3012,0 +DA:3013,0 +DA:3014,0 +DA:3022,0 +DA:3023,0 +DA:3024,0 +DA:3025,0 +DA:3026,0 +DA:3027,0 +DA:3028,0 +DA:3029,0 +DA:3030,0 +DA:3035,0 +DA:3036,0 +DA:3041,0 +DA:3042,0 +DA:3043,0 +DA:3044,0 +DA:3045,0 +DA:3051,0 +DA:3052,0 +DA:3053,0 +DA:3054,0 +DA:3056,0 +DA:3057,0 +DA:3060,0 +DA:3061,0 +DA:3062,0 +DA:3063,0 +DA:3066,0 +DA:3067,0 +DA:3068,0 +DA:3069,0 +DA:3070,0 +DA:3071,0 +DA:3072,0 +DA:3073,0 +DA:3074,0 +DA:3075,0 +LF:1371 +LH:8 +end_of_record +TN: 
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunction-register.c +FNL:0,30,44 +FNA:0,192,CeedQFunctionRegisterAll +FNF:1 +FNH:1 +DA:30,192 +DA:31,192 +DA:34,192 +DA:35,96 +DA:40,96 +DA:43,192 +LF:6 +LH:6 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunction.c +FNL:0,1023,1026 +FNA:0,0,CeedQFunctionSetContextWritable +FNL:1,1036,1040 +FNA:1,192,CeedQFunctionSetUserFlopsEstimate +FNL:2,1052,1055 +FNA:2,0,CeedQFunctionSetNumViewTabs +FNL:3,106,112 +FNA:3,1152,CeedQFunctionFieldSet +FNL:4,1067,1070 +FNA:4,0,CeedQFunctionGetNumViewTabs +FNL:5,1082,1108 +FNA:5,0,CeedQFunctionView +FNL:6,1120,1123 +FNA:6,480,CeedQFunctionGetCeed +FNL:7,1134,1134 +FNA:7,0,CeedQFunctionReturnCeed +FNL:8,1150,1160 +FNA:8,3072,CeedQFunctionApply +FNL:9,1171,1202 +FNA:9,10368,CeedQFunctionDestroy +FNL:10,127,143 +FNA:10,0,CeedQFunctionFieldView +FNL:11,155,158 +FNA:11,0,CeedQFunctionView_Object +FNL:12,169,172 +FNA:12,0,CeedQFunctionDestroy_Object +FNL:13,184,187 +FNA:13,96,CeedQFunctionSetFortranStatus +FNL:14,207,210 +FNA:14,3072,CeedQFunctionGetVectorLength +FNL:15,223,227 +FNA:15,3072,CeedQFunctionGetNumArgs +FNL:16,240,247 +FNA:16,0,CeedQFunctionGetName +FNL:17,259,277 +FNA:17,768,CeedQFunctionGetKernelName +FNL:18,289,318 +FNA:18,768,CeedQFunctionGetSourcePath +FNL:19,338,353 +FNA:19,0,CeedQFunctionLoadSourceToBuffer +FNL:20,365,368 +FNA:20,3072,CeedQFunctionGetUserFunction +FNL:21,382,386 +FNA:21,6192,CeedQFunctionGetContext +FNL:22,400,417 +FNA:22,3072,CeedQFunctionGetContextData +FNL:23,429,444 +FNA:23,3072,CeedQFunctionRestoreContextData +FNL:24,457,472 +FNA:24,0,CeedQFunctionGetInnerContext +FNL:25,486,502 +FNA:25,0,CeedQFunctionGetInnerContextData +FNL:26,514,529 +FNA:26,0,CeedQFunctionRestoreInnerContextData +FNL:27,541,544 +FNA:27,384,CeedQFunctionIsIdentity +FNL:28,556,559 +FNA:28,2304,CeedQFunctionIsContextWritable +FNL:29,571,574 +FNA:29,3456,CeedQFunctionGetData +FNL:30,586,589 +FNA:30,384,CeedQFunctionSetData +FNL:31,601,604 
+FNA:31,1152,CeedQFunctionIsImmutable +FNL:32,615,618 +FNA:32,9216,CeedQFunctionSetImmutable +FNL:33,629,632 +FNA:33,4608,CeedQFunctionReference +FNL:34,64,87 +FNA:34,1536,CeedQFunctionRegister +FNL:35,642,645 +FNA:35,0,CeedQFunctionGetFlopsEstimate +FNL:36,677,711 +FNA:36,720,CeedQFunctionCreateInterior +FNL:37,724,753 +FNA:37,192,CeedQFunctionCreateInteriorByName +FNL:38,772,787 +FNA:38,0,CeedQFunctionCreateIdentity +FNL:39,804,809 +FNA:39,4608,CeedQFunctionReferenceCopy +FNL:40,834,851 +FNA:40,768,CeedQFunctionAddInput +FNL:41,876,894 +FNA:41,384,CeedQFunctionAddOutput +FNL:42,911,919 +FNA:42,5760,CeedQFunctionGetFields +FNL:43,931,934 +FNA:43,4224,CeedQFunctionFieldGetName +FNL:44,946,949 +FNA:44,9792,CeedQFunctionFieldGetSize +FNL:45,961,964 +FNA:45,14784,CeedQFunctionFieldGetEvalMode +FNL:46,980,985 +FNA:46,1152,CeedQFunctionFieldGetData +FNL:47,997,1002 +FNA:47,144,CeedQFunctionSetContext +FNF:48 +FNH:33 +DA:64,1536 +DA:67,1536 +DA:69,1536 +DA:70,1536 +DA:72,1536 +DA:73,1536 +DA:74,1536 +DA:75,1536 +DA:76,1536 +DA:77,1536 +DA:78,1536 +DA:79,1536 +DA:80,1536 +DA:82,0 +DA:85,1536 +DA:86,1536 +DA:106,1152 +DA:107,1152 +DA:108,1152 +DA:109,1152 +DA:110,1152 +DA:111,1152 +DA:127,0 +DA:128,0 +DA:133,0 +DA:134,0 +DA:141,0 +DA:142,0 +DA:155,0 +DA:156,0 +DA:157,0 +DA:169,0 +DA:170,0 +DA:171,0 +DA:184,96 +DA:185,96 +DA:186,96 +DA:207,3072 +DA:208,3072 +DA:209,3072 +DA:223,3072 +DA:224,3072 +DA:225,3072 +DA:226,3072 +DA:240,0 +DA:241,0 +DA:242,0 +DA:244,0 +DA:246,0 +DA:259,768 +DA:260,768 +DA:263,96 +DA:264,96 +DA:265,96 +DA:267,96 +DA:268,96 +DA:270,0 +DA:272,96 +DA:275,768 +DA:276,768 +DA:289,768 +DA:290,768 +DA:295,96 +DA:296,96 +DA:298,96 +DA:299,96 +DA:300,96 +DA:301,48 +DA:303,48 +DA:305,96 +DA:307,96 +DA:309,96 +DA:310,96 +DA:311,96 +DA:313,96 +DA:316,768 +DA:317,768 +DA:338,0 +DA:341,0 +DA:342,0 +DA:343,0 +DA:345,0 +DA:347,0 +DA:348,0 +DA:349,0 +DA:350,0 +DA:352,0 +DA:365,3072 +DA:366,3072 +DA:367,3072 +DA:382,6192 +DA:383,6192 +DA:384,6192 +DA:385,6192 
+DA:400,3072 +DA:404,3072 +DA:405,3072 +DA:406,1152 +DA:407,1152 +DA:408,1152 +DA:410,0 +DA:413,1920 +DA:415,3072 +DA:416,3072 +DA:429,3072 +DA:433,3072 +DA:434,3072 +DA:435,1152 +DA:436,1152 +DA:437,1152 +DA:439,0 +DA:442,3072 +DA:443,3072 +DA:457,0 +DA:460,0 +DA:461,0 +DA:462,0 +DA:464,0 +DA:465,0 +DA:466,0 +DA:468,0 +DA:470,0 +DA:471,0 +DA:486,0 +DA:490,0 +DA:491,0 +DA:492,0 +DA:493,0 +DA:494,0 +DA:496,0 +DA:499,0 +DA:501,0 +DA:514,0 +DA:518,0 +DA:519,0 +DA:520,0 +DA:521,0 +DA:522,0 +DA:524,0 +DA:527,0 +DA:528,0 +DA:541,384 +DA:542,384 +DA:543,384 +DA:556,2304 +DA:557,2304 +DA:558,2304 +DA:571,3456 +DA:572,3456 +DA:573,3456 +DA:586,384 +DA:587,384 +DA:588,384 +DA:601,1152 +DA:602,1152 +DA:603,1152 +DA:615,9216 +DA:616,9216 +DA:617,9216 +DA:629,4608 +DA:630,4608 +DA:631,4608 +DA:642,0 +DA:643,0 +DA:644,0 +DA:677,720 +DA:680,720 +DA:683,336 +DA:684,336 +DA:685,336 +DA:686,336 +DA:687,336 +DA:690,384 +DA:693,384 +DA:694,384 +DA:695,384 +DA:696,384 +DA:697,384 +DA:698,384 +DA:699,384 +DA:700,384 +DA:701,384 +DA:703,384 +DA:704,384 +DA:705,384 +DA:707,384 +DA:708,384 +DA:709,384 +DA:710,384 +DA:724,192 +DA:725,192 +DA:727,192 +DA:729,192 +DA:730,3264 +DA:732,3072 +DA:733,7296 +DA:735,3072 +DA:736,352 +DA:737,352 +DA:740,192 +DA:743,192 +DA:747,192 +DA:750,192 +DA:751,192 +DA:752,192 +DA:772,0 +DA:776,0 +DA:777,0 +DA:778,0 +DA:780,0 +DA:782,0 +DA:783,0 +DA:784,0 +DA:785,0 +DA:786,0 +DA:804,4608 +DA:805,4608 +DA:806,4608 +DA:807,4608 +DA:808,4608 +DA:834,768 +DA:837,768 +DA:838,768 +DA:839,768 +DA:840,1152 +DA:841,384 +DA:844,768 +DA:845,0 +DA:848,768 +DA:849,768 +DA:850,768 +DA:876,384 +DA:879,384 +DA:880,384 +DA:881,384 +DA:883,1152 +DA:884,768 +DA:887,384 +DA:888,0 +DA:891,384 +DA:892,384 +DA:893,384 +DA:911,5760 +DA:913,5760 +DA:914,5760 +DA:915,5760 +DA:916,5760 +DA:917,5760 +DA:918,5760 +DA:931,4224 +DA:932,4224 +DA:933,4224 +DA:946,9792 +DA:947,9792 +DA:948,9792 +DA:961,14784 +DA:962,14784 +DA:963,14784 +DA:980,1152 +DA:981,1152 +DA:982,1152 +DA:983,1152 
+DA:984,1152 +DA:997,144 +DA:998,144 +DA:999,144 +DA:1000,144 +DA:1001,144 +DA:1023,0 +DA:1024,0 +DA:1025,0 +DA:1036,192 +DA:1037,192 +DA:1038,192 +DA:1039,192 +DA:1052,0 +DA:1053,0 +DA:1054,0 +DA:1067,0 +DA:1068,0 +DA:1069,0 +DA:1082,0 +DA:1083,0 +DA:1087,0 +DA:1089,0 +DA:1090,0 +DA:1091,0 +DA:1094,0 +DA:1095,0 +DA:1097,0 +DA:1098,0 +DA:1099,0 +DA:1102,0 +DA:1103,0 +DA:1104,0 +DA:1106,0 +DA:1107,0 +DA:1120,480 +DA:1121,480 +DA:1122,480 +DA:1134,0 +DA:1150,3072 +DA:1153,3072 +DA:1154,3072 +DA:1155,3072 +DA:1157,3072 +DA:1158,3072 +DA:1159,3072 +DA:1171,10368 +DA:1172,10368 +DA:1173,9984 +DA:1174,9984 +DA:1177,384 +DA:1178,384 +DA:1181,1152 +DA:1182,768 +DA:1183,768 +DA:1185,768 +DA:1186,384 +DA:1187,384 +DA:1189,384 +DA:1190,384 +DA:1193,384 +DA:1195,384 +DA:1196,384 +DA:1197,384 +DA:1198,384 +DA:1199,384 +DA:1200,384 +DA:1201,384 +LF:330 +LH:230 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunctioncontext.c +FNL:0,111,128 +FNA:0,576,CeedQFunctionContextDestroyData +FNL:1,140,143 +FNA:1,0,CeedQFunctionContextView_Object +FNL:2,154,157 +FNA:2,0,CeedQFunctionContextDestroy_Object +FNL:3,177,180 +FNA:3,288,CeedQFunctionContextGetCeed +FNL:4,191,191 +FNA:4,0,CeedQFunctionContextReturnCeed +FNL:5,203,208 +FNA:5,1584,CeedQFunctionContextHasValidData +FNL:6,221,226 +FNA:6,0,CeedQFunctionContextHasBorrowedDataOfType +FNL:7,238,241 +FNA:7,0,CeedQFunctionContextGetState +FNL:8,253,256 +FNA:8,4680,CeedQFunctionContextGetBackendData +FNL:9,268,271 +FNA:9,288,CeedQFunctionContextSetBackendData +FNL:10,284,295 +FNA:10,0,CeedQFunctionContextGetFieldLabel +FNL:11,309,327 +FNA:11,0,CeedQFunctionContextSetGeneric +FNL:12,342,365 +FNA:12,0,CeedQFunctionContextGetGenericRead +FNL:13,36,42 +FNA:13,0,CeedQFunctionContextGetFieldIndex +FNL:14,379,388 +FNA:14,0,CeedQFunctionContextRestoreGenericRead +FNL:15,401,405 +FNA:15,0,CeedQFunctionContextSetDouble +FNL:16,419,423 +FNA:16,0,CeedQFunctionContextGetDoubleRead +FNL:17,436,440 
+FNA:17,0,CeedQFunctionContextRestoreDoubleRead +FNL:18,453,457 +FNA:18,0,CeedQFunctionContextSetInt32 +FNL:19,471,475 +FNA:19,0,CeedQFunctionContextGetInt32Read +FNL:20,488,492 +FNA:20,0,CeedQFunctionContextRestoreInt32Read +FNL:21,505,509 +FNA:21,0,CeedQFunctionContextSetBoolean +FNL:22,523,527 +FNA:22,0,CeedQFunctionContextGetBooleanRead +FNL:23,540,544 +FNA:23,0,CeedQFunctionContextRestoreBooleanRead +FNL:24,557,561 +FNA:24,576,CeedQFunctionContextGetDataDestroy +FNL:25,572,575 +FNA:25,2496,CeedQFunctionContextReference +FNL:26,58,100 +FNA:26,0,CeedQFunctionContextRegisterGeneric +FNL:27,595,610 +FNA:27,540,CeedQFunctionContextCreate +FNL:28,627,632 +FNA:28,2352,CeedQFunctionContextReferenceCopy +FNL:29,650,660 +FNA:29,288,CeedQFunctionContextSetData +FNL:30,676,694 +FNA:30,0,CeedQFunctionContextTakeData +FNL:31,713,728 +FNA:31,1584,CeedQFunctionContextGetData +FNL:32,747,761 +FNA:32,0,CeedQFunctionContextGetDataRead +FNL:33,773,780 +FNA:33,1584,CeedQFunctionContextRestoreData +FNL:34,792,799 +FNA:34,0,CeedQFunctionContextRestoreDataRead +FNL:35,814,817 +FNA:35,0,CeedQFunctionContextRegisterDouble +FNL:36,832,835 +FNA:36,0,CeedQFunctionContextRegisterInt32 +FNL:37,850,853 +FNA:37,0,CeedQFunctionContextRegisterBoolean +FNL:38,866,870 +FNA:38,0,CeedQFunctionContextGetAllFieldLabels +FNL:39,886,894 +FNA:39,0,CeedContextFieldLabelGetDescription +FNL:40,906,909 +FNA:40,1476,CeedQFunctionContextGetContextSize +FNL:41,921,924 +FNA:41,0,CeedQFunctionContextSetNumViewTabs +FNL:42,936,939 +FNA:42,0,CeedQFunctionContextGetNumViewTabs +FNL:43,951,969 +FNA:43,0,CeedQFunctionContextView +FNL:44,982,987 +FNA:44,0,CeedQFunctionContextSetDataDestroy +FNL:45,998,1016 +FNA:45,9360,CeedQFunctionContextDestroy +FNF:46 +FNH:14 +DA:36,0 +DA:37,0 +DA:38,0 +DA:39,0 +DA:41,0 +DA:58,0 +DA:60,0 +DA:61,0 +DA:64,0 +DA:65,0 +DA:69,0 +DA:70,0 +DA:71,0 +DA:72,0 +DA:73,0 +DA:74,0 +DA:76,0 +DA:79,0 +DA:80,0 +DA:81,0 +DA:82,0 +DA:83,0 +DA:84,0 +DA:85,0 +DA:86,0 +DA:87,0 +DA:88,0 +DA:92,0 +DA:93,0 
+DA:94,0 +DA:95,0 +DA:96,0 +DA:97,0 +DA:98,0 +DA:99,0 +DA:111,576 +DA:112,576 +DA:113,144 +DA:118,432 +DA:119,432 +DA:122,0 +DA:123,0 +DA:124,0 +DA:127,576 +DA:140,0 +DA:141,0 +DA:142,0 +DA:154,0 +DA:155,0 +DA:156,0 +DA:177,288 +DA:178,288 +DA:179,288 +DA:191,0 +DA:203,1584 +DA:204,1584 +DA:206,1584 +DA:207,1584 +DA:221,0 +DA:222,0 +DA:224,0 +DA:225,0 +DA:238,0 +DA:239,0 +DA:240,0 +DA:253,4680 +DA:254,4680 +DA:255,4680 +DA:268,288 +DA:269,288 +DA:270,288 +DA:284,0 +DA:287,0 +DA:289,0 +DA:290,0 +DA:292,0 +DA:294,0 +DA:309,0 +DA:314,0 +DA:318,0 +DA:319,0 +DA:320,0 +DA:321,0 +DA:322,0 +DA:323,0 +DA:324,0 +DA:326,0 +DA:342,0 +DA:347,0 +DA:351,0 +DA:352,0 +DA:353,0 +DA:354,0 +DA:355,0 +DA:356,0 +DA:357,0 +DA:358,0 +DA:359,0 +DA:360,0 +DA:361,0 +DA:362,0 +DA:364,0 +DA:379,0 +DA:382,0 +DA:386,0 +DA:387,0 +DA:401,0 +DA:402,0 +DA:403,0 +DA:404,0 +DA:419,0 +DA:420,0 +DA:421,0 +DA:422,0 +DA:436,0 +DA:437,0 +DA:438,0 +DA:439,0 +DA:453,0 +DA:454,0 +DA:455,0 +DA:456,0 +DA:471,0 +DA:472,0 +DA:473,0 +DA:474,0 +DA:488,0 +DA:489,0 +DA:490,0 +DA:491,0 +DA:505,0 +DA:506,0 +DA:507,0 +DA:508,0 +DA:523,0 +DA:524,0 +DA:525,0 +DA:526,0 +DA:540,0 +DA:541,0 +DA:542,0 +DA:543,0 +DA:557,576 +DA:558,576 +DA:559,576 +DA:560,576 +DA:572,2496 +DA:573,2496 +DA:574,2496 +DA:595,540 +DA:596,540 +DA:599,252 +DA:600,252 +DA:601,252 +DA:602,252 +DA:603,252 +DA:606,288 +DA:607,288 +DA:608,288 +DA:609,288 +DA:627,2352 +DA:628,2352 +DA:629,2352 +DA:630,2352 +DA:631,2352 +DA:650,288 +DA:651,288 +DA:652,288 +DA:655,288 +DA:656,288 +DA:657,288 +DA:658,288 +DA:659,288 +DA:676,0 +DA:677,0 +DA:678,0 +DA:680,0 +DA:681,0 +DA:683,0 +DA:684,0 +DA:687,0 +DA:688,0 +DA:691,0 +DA:692,0 +DA:693,0 +DA:713,1584 +DA:714,1584 +DA:716,1584 +DA:717,1584 +DA:719,1584 +DA:722,1584 +DA:723,1584 +DA:725,1584 +DA:726,1584 +DA:727,1584 +DA:747,0 +DA:748,0 +DA:750,0 +DA:752,0 +DA:755,0 +DA:756,0 +DA:758,0 +DA:759,0 +DA:760,0 +DA:773,1584 +DA:774,1584 +DA:776,1584 +DA:777,1584 +DA:778,1584 +DA:779,1584 +DA:792,0 +DA:793,0 +DA:795,0 
+DA:796,0 +DA:797,0 +DA:798,0 +DA:814,0 +DA:816,0 +DA:832,0 +DA:834,0 +DA:850,0 +DA:852,0 +DA:866,0 +DA:867,0 +DA:868,0 +DA:869,0 +DA:886,0 +DA:888,0 +DA:889,0 +DA:890,0 +DA:891,0 +DA:892,0 +DA:893,0 +DA:906,1476 +DA:907,1476 +DA:908,1476 +DA:921,0 +DA:922,0 +DA:923,0 +DA:936,0 +DA:937,0 +DA:938,0 +DA:951,0 +DA:952,0 +DA:955,0 +DA:957,0 +DA:958,0 +DA:959,0 +DA:962,0 +DA:963,0 +DA:964,0 +DA:965,0 +DA:967,0 +DA:968,0 +DA:982,0 +DA:983,0 +DA:984,0 +DA:985,0 +DA:986,0 +DA:998,9360 +DA:999,9360 +DA:1000,9072 +DA:1001,9072 +DA:1003,288 +DA:1005,288 +DA:1006,288 +DA:1007,288 +DA:1008,0 +DA:1009,0 +DA:1010,0 +DA:1012,288 +DA:1013,288 +DA:1014,288 +DA:1015,288 +LF:274 +LH:81 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-register.c +FNL:0,30,44 +FNA:0,432,CeedRegisterAll +FNF:1 +FNH:1 +DA:30,432 +DA:31,432 +DA:34,432 +DA:35,192 +DA:40,192 +DA:43,432 +LF:6 +LH:6 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-tensor.c +FNL:0,124,136 +FNA:0,0,CeedTensorContractStridedApply +FNL:1,148,151 +FNA:1,192,CeedTensorContractGetCeed +FNL:2,162,162 +FNA:2,192,CeedTensorContractReturnCeed +FNL:3,174,177 +FNA:3,0,CeedTensorContractGetData +FNL:4,189,192 +FNA:4,0,CeedTensorContractSetData +FNL:5,203,206 +FNA:5,384,CeedTensorContractReference +FNL:6,223,228 +FNA:6,0,CeedTensorContractReferenceCopy +FNL:7,239,250 +FNA:7,768,CeedTensorContractDestroy +FNL:8,31,34 +FNA:8,0,CeedTensorContractDestroy_Object +FNL:9,54,69 +FNA:9,576,CeedTensorContractCreate +FNL:10,94,98 +FNA:10,8064,CeedTensorContractApply +FNF:11 +FNH:6 +DA:31,0 +DA:32,0 +DA:33,0 +DA:54,576 +DA:55,576 +DA:58,192 +DA:59,192 +DA:60,192 +DA:61,192 +DA:62,192 +DA:65,384 +DA:66,384 +DA:67,384 +DA:68,384 +DA:94,8064 +DA:96,8064 +DA:97,8064 +DA:124,0 +DA:126,0 +DA:127,0 +DA:128,0 +DA:131,0 +DA:132,0 +DA:135,0 +DA:148,192 +DA:149,192 +DA:150,192 +DA:162,192 +DA:174,0 +DA:175,0 +DA:176,0 +DA:189,0 +DA:190,0 +DA:191,0 +DA:203,384 +DA:204,384 +DA:205,384 +DA:223,0 +DA:224,0 +DA:225,0 +DA:226,0 +DA:227,0 
+DA:239,768 +DA:240,768 +DA:241,384 +DA:242,384 +DA:244,384 +DA:245,192 +DA:247,384 +DA:248,384 +DA:249,384 +LF:51 +LH:30 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed-vector.c +FNL:0,1002,1031 +FNA:0,0,CeedVectorReciprocal +FNL:1,1043,1046 +FNA:1,0,CeedVectorSetNumViewTabs +FNL:2,1058,1061 +FNA:2,0,CeedVectorGetNumViewTabs +FNL:3,1080,1111 +FNA:3,0,CeedVectorViewRange +FNL:4,1124,1130 +FNA:4,0,CeedVectorView +FNL:5,113,118 +FNA:5,0,CeedVectorHasBorrowedArrayOfType +FNL:6,1142,1145 +FNA:6,3648,CeedVectorGetCeed +FNL:7,1156,1156 +FNA:7,0,CeedVectorReturnCeed +FNL:8,1168,1171 +FNA:8,123936,CeedVectorGetLength +FNL:9,1182,1194 +FNA:9,20352,CeedVectorDestroy +FNL:10,130,133 +FNA:10,384,CeedVectorGetState +FNL:11,145,148 +FNA:11,74688,CeedVectorGetData +FNL:12,160,163 +FNA:12,3648,CeedVectorSetData +FNL:13,174,177 +FNA:13,1632,CeedVectorReference +FNL:14,198,216 +FNA:14,6120,CeedVectorCreate +FNL:15,233,238 +FNA:15,8928,CeedVectorReferenceCopy +FNL:16,250,285 +FNA:16,0,CeedVectorCopy +FNL:17,300,336 +FNA:17,0,CeedVectorCopyStrided +FNL:18,353,365 +FNA:18,6744,CeedVectorSetArray +FNL:19,377,395 +FNA:19,1056,CeedVectorSetValue +FNL:20,412,435 +FNA:20,0,CeedVectorSetValueStrided +FNL:21,450,468 +FNA:21,0,CeedVectorSyncArray +FNL:22,485,508 +FNA:22,0,CeedVectorTakeArray +FNL:23,52,55 +FNA:23,0,CeedVectorView_Object +FNL:24,527,549 +FNA:24,1824,CeedVectorGetArray +FNL:25,565,586 +FNA:25,14976,CeedVectorGetArrayRead +FNL:26,602,618 +FNA:26,11088,CeedVectorGetArrayWrite +FNL:27,630,639 +FNA:27,12912,CeedVectorRestoreArray +FNL:28,651,661 +FNA:28,14976,CeedVectorRestoreArrayRead +FNL:29,66,69 +FNA:29,0,CeedVectorDestroy_Object +FNL:30,677,723 +FNA:30,0,CeedVectorNorm +FNL:31,735,757 +FNA:31,0,CeedVectorScale +FNL:32,770,827 +FNA:32,0,CeedVectorAXPY +FNL:33,841,898 +FNA:33,0,CeedVectorAXPBY +FNL:34,89,100 +FNA:34,16800,CeedVectorHasValidArray +FNL:35,913,991 +FNA:35,0,CeedVectorPointwiseMult +FNF:36 +FNH:17 +DA:52,0 +DA:53,0 +DA:54,0 +DA:66,0 +DA:67,0 +DA:68,0 
+DA:89,16800 +DA:92,16800 +DA:93,16800 +DA:94,16800 +DA:95,0 +DA:96,0 +DA:98,16800 +DA:99,16800 +DA:113,0 +DA:114,0 +DA:116,0 +DA:117,0 +DA:130,384 +DA:131,384 +DA:132,384 +DA:145,74688 +DA:146,74688 +DA:147,74688 +DA:160,3648 +DA:161,3648 +DA:162,3648 +DA:174,1632 +DA:175,1632 +DA:176,1632 +DA:198,6120 +DA:199,6120 +DA:200,6120 +DA:203,2472 +DA:204,2472 +DA:205,2472 +DA:206,2472 +DA:207,2472 +DA:210,3648 +DA:211,3648 +DA:212,3648 +DA:213,3648 +DA:214,3648 +DA:215,3648 +DA:233,8928 +DA:234,8928 +DA:235,8928 +DA:236,8928 +DA:237,8928 +DA:250,0 +DA:258,0 +DA:259,0 +DA:260,0 +DA:262,0 +DA:263,0 +DA:264,0 +DA:268,0 +DA:274,0 +DA:275,0 +DA:276,0 +DA:280,0 +DA:281,0 +DA:283,0 +DA:284,0 +DA:300,0 +DA:302,0 +DA:303,0 +DA:309,0 +DA:310,0 +DA:311,0 +DA:312,0 +DA:314,0 +DA:316,0 +DA:320,0 +DA:321,0 +DA:322,0 +DA:323,0 +DA:327,0 +DA:328,0 +DA:329,0 +DA:330,0 +DA:333,0 +DA:334,0 +DA:335,0 +DA:353,6744 +DA:356,6744 +DA:357,6744 +DA:359,6744 +DA:361,6744 +DA:362,6744 +DA:363,6744 +DA:364,6744 +DA:377,1056 +DA:378,1056 +DA:380,1056 +DA:382,1056 +DA:383,144 +DA:384,144 +DA:389,912 +DA:390,912 +DA:391,274224 +DA:392,912 +DA:394,1056 +DA:412,0 +DA:415,0 +DA:417,0 +DA:418,0 +DA:419,0 +DA:422,0 +DA:423,0 +DA:424,0 +DA:428,0 +DA:429,0 +DA:430,0 +DA:431,0 +DA:432,0 +DA:434,0 +DA:450,0 +DA:453,0 +DA:456,0 +DA:457,0 +DA:459,0 +DA:460,0 +DA:464,0 +DA:465,0 +DA:467,0 +DA:485,0 +DA:487,0 +DA:489,0 +DA:490,0 +DA:492,0 +DA:493,0 +DA:494,0 +DA:496,0 +DA:497,0 +DA:500,0 +DA:501,0 +DA:504,0 +DA:506,0 +DA:507,0 +DA:527,1824 +DA:530,1824 +DA:531,1824 +DA:533,1824 +DA:535,1824 +DA:536,1824 +DA:537,1824 +DA:539,1824 +DA:540,1824 +DA:543,1824 +DA:545,0 +DA:547,1824 +DA:548,1824 +DA:565,14976 +DA:568,14976 +DA:569,14976 +DA:572,14976 +DA:573,14976 +DA:574,14976 +DA:576,14976 +DA:577,14976 +DA:580,14976 +DA:582,0 +DA:584,14976 +DA:585,14976 +DA:602,11088 +DA:605,11088 +DA:606,11088 +DA:608,11088 +DA:610,11088 +DA:611,11088 +DA:612,11088 +DA:614,0 +DA:616,11088 +DA:617,11088 +DA:630,12912 +DA:633,12912 
+DA:634,12912 +DA:635,12912 +DA:636,12912 +DA:637,12912 +DA:638,12912 +DA:651,14976 +DA:654,14976 +DA:656,14976 +DA:657,14976 +DA:658,14976 +DA:659,14976 +DA:660,14976 +DA:677,0 +DA:678,0 +DA:681,0 +DA:682,0 +DA:685,0 +DA:686,0 +DA:687,0 +DA:688,0 +DA:692,0 +DA:693,0 +DA:694,0 +DA:698,0 +DA:699,0 +DA:701,0 +DA:702,0 +DA:703,0 +DA:704,0 +DA:705,0 +DA:707,0 +DA:708,0 +DA:709,0 +DA:710,0 +DA:712,0 +DA:713,0 +DA:714,0 +DA:715,0 +DA:716,0 +DA:719,0 +DA:721,0 +DA:722,0 +DA:735,0 +DA:736,0 +DA:738,0 +DA:740,0 +DA:741,0 +DA:745,0 +DA:746,0 +DA:749,0 +DA:752,0 +DA:753,0 +DA:754,0 +DA:755,0 +DA:756,0 +DA:770,0 +DA:771,0 +DA:773,0 +DA:774,0 +DA:776,0 +DA:777,0 +DA:778,0 +DA:782,0 +DA:784,0 +DA:785,0 +DA:787,0 +DA:788,0 +DA:794,0 +DA:795,0 +DA:796,0 +DA:797,0 +DA:798,0 +DA:799,0 +DA:800,0 +DA:802,0 +DA:803,0 +DA:807,0 +DA:810,0 +DA:811,0 +DA:812,0 +DA:816,0 +DA:817,0 +DA:819,0 +DA:820,0 +DA:822,0 +DA:824,0 +DA:825,0 +DA:826,0 +DA:841,0 +DA:842,0 +DA:844,0 +DA:845,0 +DA:847,0 +DA:848,0 +DA:849,0 +DA:853,0 +DA:855,0 +DA:856,0 +DA:858,0 +DA:859,0 +DA:865,0 +DA:866,0 +DA:867,0 +DA:868,0 +DA:869,0 +DA:870,0 +DA:871,0 +DA:873,0 +DA:874,0 +DA:878,0 +DA:881,0 +DA:882,0 +DA:883,0 +DA:887,0 +DA:888,0 +DA:890,0 +DA:891,0 +DA:893,0 +DA:895,0 +DA:896,0 +DA:897,0 +DA:913,0 +DA:914,0 +DA:915,0 +DA:916,0 +DA:919,0 +DA:920,0 +DA:921,0 +DA:922,0 +DA:930,0 +DA:931,0 +DA:932,0 +DA:933,0 +DA:934,0 +DA:935,0 +DA:936,0 +DA:937,0 +DA:938,0 +DA:939,0 +DA:941,0 +DA:942,0 +DA:943,0 +DA:946,0 +DA:947,0 +DA:949,0 +DA:950,0 +DA:954,0 +DA:957,0 +DA:958,0 +DA:959,0 +DA:963,0 +DA:964,0 +DA:966,0 +DA:968,0 +DA:969,0 +DA:971,0 +DA:973,0 +DA:974,0 +DA:975,0 +DA:976,0 +DA:977,0 +DA:978,0 +DA:981,0 +DA:982,0 +DA:983,0 +DA:985,0 +DA:987,0 +DA:988,0 +DA:989,0 +DA:990,0 +DA:1002,0 +DA:1003,0 +DA:1007,0 +DA:1008,0 +DA:1012,0 +DA:1015,0 +DA:1016,0 +DA:1019,0 +DA:1020,0 +DA:1021,0 +DA:1024,0 +DA:1025,0 +DA:1026,0 +DA:1029,0 +DA:1030,0 +DA:1043,0 +DA:1044,0 +DA:1045,0 +DA:1058,0 +DA:1059,0 +DA:1060,0 +DA:1080,0 
+DA:1082,0 +DA:1086,0 +DA:1089,0 +DA:1091,0 +DA:1092,0 +DA:1093,0 +DA:1096,0 +DA:1097,0 +DA:1098,0 +DA:1099,0 +DA:1101,0 +DA:1102,0 +DA:1104,0 +DA:1105,0 +DA:1106,0 +DA:1107,0 +DA:1108,0 +DA:1109,0 +DA:1110,0 +DA:1124,0 +DA:1127,0 +DA:1128,0 +DA:1129,0 +DA:1142,3648 +DA:1143,3648 +DA:1144,3648 +DA:1156,0 +DA:1168,123936 +DA:1169,123936 +DA:1170,123936 +DA:1182,20352 +DA:1183,20352 +DA:1184,16704 +DA:1185,16704 +DA:1187,3648 +DA:1188,3648 +DA:1190,3648 +DA:1191,3648 +DA:1192,3648 +DA:1193,3648 +LF:409 +LH:118 +end_of_record +TN: +SF:/home/jeremy/Dev/libCEED/interface/ceed.c +FNL:0,1013,1022 +FNA:0,0,CeedGetRustSourceRoots +FNL:1,1034,1042 +FNA:1,48,CeedRestoreJitSourceRoots +FNL:2,1054,1062 +FNA:2,0,CeedRestoreRustSourceRoots +FNL:3,1077,1086 +FNA:3,0,CeedGetJitDefines +FNL:4,1098,1106 +FNA:4,0,CeedRestoreJitDefines +FNL:5,1171,1377 +FNA:5,432,CeedInit +FNL:6,123,139 +FNA:6,4416,CeedRegisterImpl +FNL:7,1389,1402 +FNA:7,0,CeedSetStream +FNL:8,1419,1424 +FNA:8,24420,CeedReferenceCopy +FNL:9,1436,1439 +FNA:9,648,CeedGetResource +FNL:10,1451,1466 +FNA:10,0,CeedGetPreferredMemType +FNL:11,1478,1481 +FNA:11,0,CeedIsDeterministic +FNL:12,1493,1512 +FNA:12,432,CeedAddJitSourceRoot +FNL:13,150,153 +FNA:13,0,CeedWorkVectorsCreate +FNL:14,1524,1545 +FNA:14,0,CeedAddRustSourceRoot +FNL:15,1557,1576 +FNA:15,0,CeedAddJitDefine +FNL:16,1588,1591 +FNA:16,0,CeedSetNumViewTabs +FNL:17,1603,1606 +FNA:17,0,CeedGetNumViewTabs +FNL:18,1618,1638 +FNA:18,0,CeedView +FNL:19,164,179 +FNA:19,300,CeedWorkVectorsDestroy +FNL:20,1649,1693 +FNA:20,48480,CeedDestroy +FNL:21,1711,1732 +FNA:21,0,CeedErrorImpl +FNL:22,1800,1807 +FNA:22,0,CeedErrorExit +FNL:23,1819,1824 +FNA:23,0,CeedSetErrorHandler +FNL:24,1838,1842 +FNA:24,0,CeedGetErrorMessage +FNL:25,1856,1861 +FNA:25,0,CeedResetErrorMessage +FNL:26,1882,1888 +FNA:26,0,CeedGetVersion +FNL:27,1899,1902 +FNA:27,0,CeedGetScalarType +FNL:28,191,194 +FNA:28,0,CeedView_Object +FNL:29,205,208 +FNA:29,0,CeedDestroy_Object +FNL:30,282,286 
+FNA:30,1812,CeedMallocArray +FNL:31,303,307 +FNA:31,43296,CeedCallocArray +FNL:32,324,328 +FNA:32,1776,CeedReallocArray +FNL:33,344,349 +FNA:33,2928,CeedStringAllocCopy +FNL:34,360,364 +FNA:34,58680,CeedFree +FNL:35,380,407 +FNA:35,5400,CeedSetHostGenericArray +FNL:36,422,426 +FNA:36,0,CeedSetHostBoolArray +FNL:37,441,445 +FNA:37,0,CeedSetHostCeedInt8Array +FNL:38,460,464 +FNA:38,648,CeedSetHostCeedIntArray +FNL:39,479,483 +FNA:39,4752,CeedSetHostCeedScalarArray +FNL:40,498,502 +FNA:40,1536,CeedRegister +FNL:41,514,517 +FNA:41,0,CeedIsDebug +FNL:42,533,540 +FNA:42,0,CeedGetResourceRoot +FNL:43,552,560 +FNA:43,4032,CeedGetParent +FNL:44,572,576 +FNA:44,4428,CeedGetDelegate +FNL:45,591,595 +FNA:45,240,CeedSetDelegate +FNL:46,608,621 +FNA:46,4428,CeedGetObjectDelegate +FNL:47,638,656 +FNA:47,0,CeedSetObjectDelegate +FNL:48,668,678 +FNA:48,0,CeedGetOperatorFallbackCeed +FNL:49,692,696 +FNA:49,0,CeedSetOperatorFallbackCeed +FNL:50,708,711 +FNA:50,384,CeedSetDeterministic +FNL:51,730,752 +FNA:51,63624,CeedSetBackendFunctionImpl +FNL:52,764,767 +FNA:52,672,CeedGetData +FNL:53,779,782 +FNA:53,96,CeedSetData +FNL:54,793,796 +FNA:54,24420,CeedReference +FNL:55,808,830 +FNA:55,0,CeedGetWorkVectorMemoryUsage +FNL:56,842,874 +FNA:56,0,CeedClearWorkVectors +FNL:57,889,938 +FNA:57,0,CeedGetWorkVector +FNL:58,95,98 +FNA:58,0,CeedRequestWait +FNL:59,950,974 +FNA:59,0,CeedRestoreWorkVector +FNL:60,989,998 +FNA:60,48,CeedGetJitSourceRoots +FNF:61 +FNH:27 +DA:95,0 +DA:96,0 +DA:97,0 +DA:123,4416 +DA:124,4416 +DA:127,4416 +DA:128,4416 +DA:129,4416 +DA:130,4416 +DA:131,4416 +DA:132,4416 +DA:134,0 +DA:137,4416 +DA:138,4416 +DA:150,0 +DA:151,0 +DA:152,0 +DA:164,300 +DA:165,300 +DA:166,0 +DA:167,0 +DA:169,0 +DA:170,0 +DA:171,0 +DA:173,0 +DA:175,0 +DA:176,0 +DA:177,0 +DA:178,0 +DA:191,0 +DA:192,0 +DA:193,0 +DA:205,0 +DA:206,0 +DA:207,0 +DA:282,1812 +DA:283,1812 +DA:284,1812 +DA:285,1812 +DA:303,43296 +DA:304,43296 +DA:305,43296 +DA:306,43296 +DA:324,1776 +DA:325,1776 +DA:326,1776 
+DA:327,1776 +DA:344,2928 +DA:345,2928 +DA:346,2928 +DA:347,2928 +DA:348,2928 +DA:360,58680 +DA:361,58680 +DA:362,58680 +DA:363,58680 +DA:380,5400 +DA:382,5400 +DA:383,2112 +DA:384,2112 +DA:385,2112 +DA:386,0 +DA:388,2112 +DA:389,2112 +DA:392,2112 +DA:393,2112 +DA:394,360 +DA:395,360 +DA:396,360 +DA:397,360 +DA:398,360 +DA:399,360 +DA:400,2928 +DA:401,2928 +DA:402,2928 +DA:403,2928 +DA:404,2928 +DA:406,5400 +DA:422,0 +DA:424,0 +DA:425,0 +DA:441,0 +DA:443,0 +DA:444,0 +DA:460,648 +DA:462,648 +DA:463,648 +DA:479,4752 +DA:481,4752 +DA:482,4752 +DA:498,1536 +DA:499,1536 +DA:500,1536 +DA:501,1536 +DA:514,0 +DA:515,0 +DA:516,0 +DA:533,0 +DA:534,0 +DA:535,0 +DA:537,0 +DA:538,0 +DA:539,0 +DA:552,4032 +DA:553,4032 +DA:554,1704 +DA:555,1704 +DA:557,2328 +DA:558,2328 +DA:559,2328 +DA:572,4428 +DA:573,4428 +DA:574,4428 +DA:575,4428 +DA:591,240 +DA:592,240 +DA:593,240 +DA:594,240 +DA:608,4428 +DA:610,4428 +DA:611,0 +DA:612,0 +DA:613,0 +DA:614,0 +DA:619,4428 +DA:620,4428 +DA:638,0 +DA:639,0 +DA:642,0 +DA:643,0 +DA:645,0 +DA:647,0 +DA:650,0 +DA:651,0 +DA:654,0 +DA:655,0 +DA:668,0 +DA:669,0 +DA:670,0 +DA:671,0 +DA:675,0 +DA:676,0 +DA:677,0 +DA:692,0 +DA:693,0 +DA:694,0 +DA:695,0 +DA:708,384 +DA:709,384 +DA:710,384 +DA:730,63624 +DA:731,63624 +DA:734,63624 +DA:735,63624 +DA:736,63624 +DA:739,2096256 +DA:740,2096256 +DA:741,63624 +DA:742,63624 +DA:744,63624 +DA:745,63624 +DA:764,672 +DA:765,672 +DA:766,672 +DA:779,96 +DA:780,96 +DA:781,96 +DA:793,24420 +DA:794,24420 +DA:795,24420 +DA:808,0 +DA:809,0 +DA:812,0 +DA:813,0 +DA:814,0 +DA:815,0 +DA:816,0 +DA:818,0 +DA:819,0 +DA:820,0 +DA:822,0 +DA:823,0 +DA:825,0 +DA:826,0 +DA:829,0 +DA:842,0 +DA:843,0 +DA:846,0 +DA:847,0 +DA:848,0 +DA:849,0 +DA:850,0 +DA:852,0 +DA:853,0 +DA:854,0 +DA:856,0 +DA:857,0 +DA:859,0 +DA:860,0 +DA:861,0 +DA:863,0 +DA:864,0 +DA:865,0 +DA:866,0 +DA:867,0 +DA:868,0 +DA:869,0 +DA:873,0 +DA:889,0 +DA:890,0 +DA:893,0 +DA:896,0 +DA:897,0 +DA:898,0 +DA:899,0 +DA:900,0 +DA:903,0 +DA:906,0 +DA:907,0 +DA:910,0 +DA:911,0 
+DA:915,0 +DA:916,0 +DA:917,0 +DA:918,0 +DA:919,0 +DA:920,0 +DA:921,0 +DA:922,0 +DA:923,0 +DA:925,0 +DA:926,0 +DA:928,0 +DA:929,0 +DA:932,0 +DA:933,0 +DA:934,0 +DA:936,0 +DA:937,0 +DA:950,0 +DA:951,0 +DA:954,0 +DA:955,0 +DA:956,0 +DA:957,0 +DA:958,0 +DA:961,0 +DA:962,0 +DA:963,0 +DA:964,0 +DA:965,0 +DA:967,0 +DA:968,0 +DA:989,48 +DA:992,48 +DA:993,48 +DA:994,48 +DA:995,48 +DA:996,48 +DA:997,48 +DA:1013,0 +DA:1016,0 +DA:1017,0 +DA:1018,0 +DA:1019,0 +DA:1020,0 +DA:1021,0 +DA:1034,48 +DA:1037,48 +DA:1038,48 +DA:1039,48 +DA:1040,48 +DA:1041,48 +DA:1054,0 +DA:1057,0 +DA:1058,0 +DA:1059,0 +DA:1060,0 +DA:1061,0 +DA:1077,0 +DA:1080,0 +DA:1081,0 +DA:1082,0 +DA:1083,0 +DA:1084,0 +DA:1085,0 +DA:1098,0 +DA:1101,0 +DA:1102,0 +DA:1103,0 +DA:1104,0 +DA:1105,0 +DA:1171,432 +DA:1172,432 +DA:1175,432 +DA:1176,432 +DA:1179,432 +DA:1180,432 +DA:1181,432 +DA:1182,432 +DA:1183,0 +DA:1185,0 +DA:1186,0 +DA:1188,0 +DA:1190,0 +DA:1191,0 +DA:1193,432 +DA:1197,432 +DA:1198,9504 +DA:1199,10368 +DA:1200,9936 +DA:1201,9936 +DA:1202,65520 +DA:1203,9936 +DA:1204,9936 +DA:1205,864 +DA:1206,864 +DA:1207,864 +DA:1211,432 +DA:1249,432 +DA:1250,432 +DA:1251,432 +DA:1252,432 +DA:1253,432 +DA:1254,432 +DA:1255,432 +DA:1256,0 +DA:1257,0 +DA:1258,432 +DA:1259,432 +DA:1262,432 +DA:1351,432 +DA:1352,432 +DA:1355,432 +DA:1358,432 +DA:1362,432 +DA:1366,432 +DA:1368,432 +DA:1369,0 +DA:1371,432 +DA:1375,432 +DA:1376,432 +DA:1389,0 +DA:1390,0 +DA:1391,0 +DA:1392,0 +DA:1395,0 +DA:1397,0 +DA:1398,0 +DA:1399,0 +DA:1401,0 +DA:1419,24420 +DA:1420,24420 +DA:1421,24420 +DA:1422,24420 +DA:1423,24420 +DA:1436,648 +DA:1437,648 +DA:1438,648 +DA:1451,0 +DA:1452,0 +DA:1453,0 +DA:1456,0 +DA:1458,0 +DA:1459,0 +DA:1461,0 +DA:1463,0 +DA:1465,0 +DA:1478,0 +DA:1479,0 +DA:1480,0 +DA:1493,432 +DA:1496,432 +DA:1497,432 +DA:1499,432 +DA:1500,432 +DA:1502,432 +DA:1503,432 +DA:1504,432 +DA:1505,432 +DA:1507,432 +DA:1508,432 +DA:1509,432 +DA:1510,432 +DA:1511,432 +DA:1524,0 +DA:1527,0 +DA:1528,0 +DA:1530,0 +DA:1531,0 +DA:1533,0 +DA:1534,0 
+DA:1535,0 +DA:1536,0 +DA:1538,0 +DA:1539,0 +DA:1540,0 +DA:1541,0 +DA:1542,0 +DA:1543,0 +DA:1544,0 +DA:1557,0 +DA:1560,0 +DA:1561,0 +DA:1563,0 +DA:1564,0 +DA:1566,0 +DA:1567,0 +DA:1568,0 +DA:1569,0 +DA:1571,0 +DA:1572,0 +DA:1573,0 +DA:1574,0 +DA:1575,0 +DA:1588,0 +DA:1589,0 +DA:1590,0 +DA:1603,0 +DA:1604,0 +DA:1605,0 +DA:1618,0 +DA:1619,0 +DA:1622,0 +DA:1625,0 +DA:1627,0 +DA:1628,0 +DA:1629,0 +DA:1631,0 +DA:1635,0 +DA:1636,0 +DA:1637,0 +DA:1649,48480 +DA:1650,48480 +DA:1651,48180 +DA:1652,48180 +DA:1655,300 +DA:1657,300 +DA:1659,300 +DA:1661,300 +DA:1662,0 +DA:1663,0 +DA:1664,0 +DA:1666,0 +DA:1669,300 +DA:1671,600 +DA:1672,300 +DA:1674,300 +DA:1676,300 +DA:1677,0 +DA:1679,300 +DA:1681,300 +DA:1682,0 +DA:1684,300 +DA:1686,300 +DA:1687,300 +DA:1688,300 +DA:1689,300 +DA:1690,300 +DA:1691,300 +DA:1692,300 +DA:1711,0 +DA:1715,0 +DA:1716,0 +DA:1717,0 +DA:1800,0 +DA:1801,0 +DA:1803,0 +DA:1804,0 +DA:1805,0 +DA:1819,0 +DA:1820,0 +DA:1821,0 +DA:1822,0 +DA:1823,0 +DA:1838,0 +DA:1839,0 +DA:1840,0 +DA:1841,0 +DA:1856,0 +DA:1857,0 +DA:1858,0 +DA:1859,0 +DA:1860,0 +DA:1882,0 +DA:1883,0 +DA:1884,0 +DA:1885,0 +DA:1886,0 +DA:1887,0 +DA:1899,0 +DA:1900,0 +DA:1901,0 +LF:493 +LH:205 +end_of_record +TN: +SF:/usr/include/valgrind/valgrind.h +FNL:0,7293,7322 +FNA:0,0,VALGRIND_PRINTF +FNL:1,7332,7361 +FNA:1,0,VALGRIND_PRINTF_BACKTRACE +FNF:2 +FNH:0 +DA:7293,0 +DA:7305,0 +DA:7313,0 +DA:7319,0 +DA:7320,0 +DA:7332,0 +DA:7344,0 +DA:7352,0 +DA:7358,0 +DA:7359,0 +LF:10 +LH:0 +end_of_record diff --git a/doc/img/libCEEDBackends.svg b/doc/img/libCEEDBackends.svg index cff3b2527a..d8e96bb13b 100644 --- a/doc/img/libCEEDBackends.svg +++ b/doc/img/libCEEDBackends.svg @@ -1,1862 +1,1128 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/doc/img/libCEEDBackends.tex b/doc/img/libCEEDBackends.tex new file mode 100644 index 0000000000..a1b9d28652 --- /dev/null +++ b/doc/img/libCEEDBackends.tex @@ -0,0 +1,192 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} +\usepackage{pgfplots} +\usepackage{pgfmath} +\usepackage{libertine} +\usetikzlibrary{calc} + +\renewcommand{\familydefault}{\sfdefault} + +\definecolor{ceed@blue}{RGB}{100,150,230} +\definecolor{ceed@green}{RGB}{75,200,75} +\definecolor{ceed@red}{RGB}{200,75,75} +\definecolor{ceed@orange}{RGB}{252,186,3} + +\pgfplotsset{compat=1.18} + +\begin{document} + +\begin{tikzpicture} + +\begin{scope}[shift={(0,-0.6)}] + \node at (1.0,6.1) {\large Application}; + + % PETSc + \draw[ + top color=ceed@red!10!white, + bottom color=ceed@red!40!white, + ceed@red!60!black, + ] (0.0,3.0) rectangle ++(1.6,0.8) + node[pos=.5,align=center,color=black] {PETSc}; + \draw[-stealth, line width=0.5pt] (1.6, 3.0+0.4) -- ++(1.6,-1.2-0.4); + + % Ratel + \draw[ + top color=ceed@red!10!white, + bottom color=ceed@red!40!white, + ceed@red!60!black, + ] (0.0,1.8) rectangle ++(1.6,0.8) + node[pos=.5,align=center,color=black] {Ratel}; + \draw[-stealth, line width=0.5pt] (1.6, 1.8+0.4) -- ++(1.6,0.0-0.55); + + % HONEE + \draw[ + top color=ceed@red!10!white, + bottom color=ceed@red!40!white, + ceed@red!60!black, + ] (0.0,0.6) rectangle ++(1.6,0.8) + node[pos=.5,align=center,color=black] {HONEE}; + \draw[-stealth, line width=0.5pt] (1.6, 0.6+0.4) -- ++(1.6,1.2-0.65); + + % MFEM + \draw[ + top color=ceed@red!10!white, + bottom color=ceed@red!40!white, + ceed@red!60!black, + ] (0.0,-0.6) rectangle ++(1.6,0.8) + node[pos=.5,align=center,color=black] {MFEM}; + \draw[-stealth, line width=0.5pt] (1.6, -0.6+0.4) -- ++(1.6,2.4-0.8); +\end{scope} + +\begin{scope}[shift={(3.2,0)}] + \begin{scope}[shift={(0,-0.6)}] + \node at (0.8,6.1) {\large Library}; + \draw[ + top color=ceed@blue!10!white, + bottom color=ceed@blue!40!white, + ceed@blue!60!black, + ] (0.0,1.2) rectangle ++(1.6,0.8) + 
node[pos=.5,align=center,color=black] {libCEED}; + + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,3.6); + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,2.4); + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,1.2); + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,0.0); + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-1.2); + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-2.4); + \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-3.6); + \end{scope} +\end{scope} + +\begin{scope}[shift={(6.4,0)}] + \begin{scope}[shift={(0,-0.6)}] + \node at (0.95,6.1) {\large Backends}; + + % C + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,4.8) rectangle ++(2.0,0.8) + node[pos=.5,align=center,color=black] {Pure C}; + \draw[-stealth, line width=0.5pt] (2.0, 5.2) -- ++(1.2,-1.2+0.15); + + % AVX + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,3.6) rectangle ++(2.0,0.8) + node[pos=.5,align=center,color=black] {AVX}; + \draw[-stealth, line width=0.5pt] (2.0, 4.0) -- ++(1.2,+0.0+0.025); + + % LIBXSMM + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,2.4) rectangle ++(2.0,0.8) + node[pos=.5,align=center,color=black] {LIBXSMM}; + \draw[-stealth, line width=0.5pt] (2.0, 2.8) -- ++(1.2,1.1-0.025); + + % CUDA + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,1.2) rectangle ++(2.0,0.8) + node[pos=.5,align=center,color=black] {CUDA}; + \draw[-stealth, line width=0.5pt] (2.0, 1.6) -- ++(1.2,0.0+0.025); + + % HIP + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,0.0) rectangle ++(2.0,0.8) + node[pos=.5,align=center,color=black] {HIP}; + \draw[-stealth, line width=0.5pt] (2.0, 0.4) -- ++(1.2,0.0+0.025); + + % SYCL + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,-1.2) rectangle 
++(2.0,0.8) + node[pos=.5,align=center,color=black] {SYCL}; + \draw[-stealth, line width=0.5pt] (2.0, -0.8) -- ++(1.2,0.0+0.025); + + % MAGMA + \draw[ + top color=black!5!white, + bottom color=black!20!white, + black!80!white, + ] (0.0,-2.4) rectangle ++(2.0,0.8) + node[pos=.5,align=center,color=black] {MAGMA}; + \draw[-stealth, line width=0.5pt] (2.0, -2.0) -- ++(1.2,3.7-0.15); + \draw[-stealth, line width=0.5pt] (2.0, -2.0) -- ++(1.2,2.5-0.15); + + \end{scope} +\end{scope} + +\begin{scope}[shift={(9.6,0)}] + \begin{scope}[shift={(0,-0.6)}] + \node at (1.1,6.1) {\large Hardware}; + + % CPU + \draw[ + top color=ceed@green!20!white, + bottom color=ceed@green!60!white, + ceed@green!60!black, + ] (0.0,3.6) rectangle ++(2.2,0.8) + node[pos=.5,align=center,color=black] {CPU}; + + % CUDA GPU + \draw[ + top color=ceed@green!20!white, + bottom color=ceed@green!60!white, + ceed@green!60!black, + ] (0.0,1.2) rectangle ++(2.2,0.8) + node[pos=.5,align=center,color=black] {NVIDIA GPU}; + + % ROCm GPU + \draw[ + top color=ceed@green!20!white, + bottom color=ceed@green!60!white, + ceed@green!60!black, + ] (0.0,-0.0) rectangle ++(2.2,0.8) + node[pos=.5,align=center,color=black] {AMD GPU}; + + % Intel GPU + \draw[ + top color=ceed@green!20!white, + bottom color=ceed@green!60!white, + ceed@green!60!black, + ] (0.0,-1.2) rectangle ++(2.2,0.8) + node[pos=.5,align=center,color=black] {Intel GPU}; + + \end{scope} +\end{scope} + +\end{tikzpicture} +\end{document} diff --git a/doc/sphinx/requirements.txt b/doc/sphinx/requirements.txt index 76b40ca3ab..f4f8145a5c 100644 --- a/doc/sphinx/requirements.txt +++ b/doc/sphinx/requirements.txt @@ -1,11 +1,11 @@ altair>=5.0 -breathe>=4.30 -myst-parser[linkify]>=0.14.0 -sphinx-hoverxref>=0.3b1 +breathe>=4.36 +myst-parser[linkify]>=4.0.1 +sphinx-hoverxref>=1.4.2 sphinx-design -sphinx>=5.3,<6 +sphinx>=7.2 sphinx_rtd_theme -sphinxcontrib-bibtex==2.5 +sphinxcontrib-bibtex==2.6.3 sphinxcontrib-katex sphinxcontrib-mermaid sphinxcontrib-svg2pdfconverter 
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py index 2dbcfa648a..3f956d3785 100755 --- a/doc/sphinx/source/conf.py +++ b/doc/sphinx/source/conf.py @@ -40,7 +40,6 @@ extensions = [ "sphinxext_altair.altairplot", "breathe", - "hoverxref.extension", "sphinx_design", "myst_parser", "sphinx_rtd_theme", @@ -107,6 +106,7 @@ "examples/nek/README.md", "examples/petsc/README.md", "examples/solid/README.md", + "examples/deal.II/README.md", ] # The name of the Pygments (syntax highlighting) style to use. @@ -161,13 +161,6 @@ ] } -# hoverxref options -hoverxref_auto_ref = True -hoverxref_mathjax = True -hoverxref_role_types = { - "ref": "modal", -} - latex_macros = r""" \def \diff {\operatorname{d}\!} \def \tcolon {\!:\!} diff --git a/doc/sphinx/source/gettingstarted.md b/doc/sphinx/source/gettingstarted.md index 0f1a831b59..0369bbaa40 100644 --- a/doc/sphinx/source/gettingstarted.md +++ b/doc/sphinx/source/gettingstarted.md @@ -1,5 +1,6 @@ # Getting Started ```{include} ./README.md -:start-after: gettingstarted-inclusion-marker +:start-after: +:end-before: ``` diff --git a/doc/sphinx/source/gpu.md b/doc/sphinx/source/gpu.md index f7418ac5f3..6040e52a8d 100644 --- a/doc/sphinx/source/gpu.md +++ b/doc/sphinx/source/gpu.md @@ -7,11 +7,12 @@ Code that produces correct results with CPU backends will produce correct result The filepath to the user source code is passed in {c:func}`CeedQFunctionCreateInterior` as the `source` argument. This filepath should typically be an absolute path to ensure the JiT compilation can locate the source file. -The filepath may also be relative to a root directory set with {c:func}`CeedAddJitSourceRoot`. -The {c:macro}`CEED_QFUNCTION` macro automatically creates a string with the absolute path stored in the variable `user_loc` for a {c:type}`CeedQFunctionUser` called `user`. +The filepath may also be a relative path with respect to a root directory set with {c:func}`CeedAddJitSourceRoot`. 
+The {c:macro}`CEED_QFUNCTION` macro automatically creates a string with the absolute path, for example a {c:type}`CeedQFunctionUser` called `user` would have this string stored in the variable `user_loc`. -The entire contents of this file and all locally included files (`#include "foo.h"`) are used during JiT compilation for GPU backends. -Installed headers (`#include `) are omitted in the source code passed to JiT, but the compilation environment may supply common headers such as ``. +The entire contents of this source file and all included files are used during JiT compilation for GPU backends. +Include statements for system headers that are required for CPU compilation but are not available in GPU compilation environments should be guarded with `#ifdef CEED_RUNNING_JIT_PASS`. +Any function definitions in these system headers must still be available in the GPU compilation environments, such as the contents of `<math.h>`. These source file must only contain syntax constructs supported by C99 and all targeted backends (i.e. CUDA for `/gpu/cuda`, OpenCL/SYCL for `/gpu/sycl`, etc.). All source files must be at the provided filepath at runtime for JiT to function. @@ -20,10 +21,10 @@ All source files must be at the provided filepath at runtime for JiT to function GPU backends require stricter adherence to memory access assumptions, but CPU backends may occasionally report correct results despite violations of memory access assumptions. Both `CeedVector` and `CeedQFunctionContext` have read-only and read-write accessors, and `CeedVector` allow write-only access. -Read-only access of `CeedVector` and `CeedQFunctionContext` memory spaces must be respected for proper GPU behavior. +Read-only access of `CeedVector` and `CeedQFunctionContext` memory spaces must be respected to ensure proper GPU behavior. Write-only access of `CeedVector` memory spaces asserts that all data in the `CeedVector` is invalid until overwritten. 
-`CeedQFunction` assume that all input arrays are read-only and all output arrays are write-only and the {c:type}`CeedQFunctionUser` must adhere to these assumptions, only reading data in the input arrays and fully overwriting the output arrays. +`CeedQFunction` assume that all input arrays are read-only and all output arrays are write-only and the {c:type}`CeedQFunctionUser` must adhere to these assumptions, only reading data in the input arrays and fully overwriting all entries in the output arrays. Additionally, {c:type}`CeedQFunctionUser` have read-write access for `CeedQFunctionContext` data, unless {c:func}`CeedQFunctionSetContextWritable` was used to indicate that read-only access is sufficient. The `/cpu/self/memcheck` backends explicitly verify read-only and write-only memory access assumptions. diff --git a/doc/sphinx/source/index.md b/doc/sphinx/source/index.md index 82272c54d4..f35c1bb5a0 100644 --- a/doc/sphinx/source/index.md +++ b/doc/sphinx/source/index.md @@ -8,9 +8,9 @@ intro gettingstarted libCEEDapi examples/index +api/index ffi gpu -api/index precision libCEEDdev Contributing diff --git a/doc/sphinx/source/intro.md b/doc/sphinx/source/intro.md index 3c0d04e1ef..574672b31d 100644 --- a/doc/sphinx/source/intro.md +++ b/doc/sphinx/source/intro.md @@ -22,7 +22,7 @@ Furthermore, software packages that provide high-performance implementations hav libCEED's purely algebraic interface can unobtrusively be integrated in new and legacy software to provide performance portable interfaces. While libCEED's focus is on high-order finite elements, the approach is algebraic and thus applicable to other discretizations in factored form. libCEED's role, as a lightweight portable library that allows a wide variety of applications to share highly optimized discretization kernels, is illustrated in {numref}`fig-libCEED-backends`, where a non-exhaustive list of specialized implementations (backends) is provided. 
-libCEED provides a low-level Application Programming Interface (API) for user codes so that applications with their own discretization infrastructure (e.g., those in [PETSc](https://www.mcs.anl.gov/petsc/), [MFEM](https://mfem.org/) and [Nek5000](https://nek5000.mcs.anl.gov/)) can evaluate and use the core operations provided by libCEED. GPU implementations are available via pure [CUDA](https://developer.nvidia.com/about-cuda) and pure [HIP](https://rocmdocs.amd.com) as well as the [OCCA](http://github.com/libocca/occa) and [MAGMA](https://bitbucket.org/icl/magma) libraries. +libCEED provides a low-level Application Programming Interface (API) for user codes so that applications with their own discretization infrastructure (e.g., those in [PETSc](https://www.mcs.anl.gov/petsc/), [MFEM](https://mfem.org/) and [Nek5000](https://nek5000.mcs.anl.gov/)) can evaluate and use the core operations provided by libCEED. GPU implementations are available via pure [CUDA](https://developer.nvidia.com/about-cuda) and pure [HIP](https://rocmdocs.amd.com) as well as the [MAGMA](https://bitbucket.org/icl/magma) library. CPU implementations are available via pure C and AVX intrinsics as well as the [LIBXSMM](http://github.com/hfp/libxsmm) library. libCEED provides a unified interface, so that users only need to write a single source code and can select the desired specialized implementation at run time. Moreover, each process or thread can instantiate an arbitrary number of backends. diff --git a/doc/sphinx/source/libCEEDapi.md b/doc/sphinx/source/libCEEDapi.md index b43871e422..66caa15688 100644 --- a/doc/sphinx/source/libCEEDapi.md +++ b/doc/sphinx/source/libCEEDapi.md @@ -259,7 +259,7 @@ If greater than 1, the caller must ensure that the number of quadrature points ` This is often satisfied automatically due to the element size or by batching elements together to facilitate vectorization in other stages, and can always be ensured by padding. 
In addition to the function pointers (`setup` and `mass`), {ref}`CeedQFunction` constructors take a string representation specifying where the source for the implementation is found. -This is used by backends that support Just-In-Time (JIT) compilation (i.e., CUDA and OCCA) to compile for coprocessors. +This is used by backends that support Just-In-Time (JIT) compilation (i.e., CUDA and HIP) to compile for coprocessors. For full support across all backends, these {ref}`CeedQFunction` source files must only contain constructs mutually supported by C99, C++11, and CUDA. For example, explicit type casting of void pointers and explicit use of compatible arguments for {code}`math` library functions is required, and variable-length array (VLA) syntax for array reshaping is only available via libCEED's {code}`CEED_Q_VLA` macro. diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md index 7a009ee811..e311171a79 100644 --- a/doc/sphinx/source/libCEEDdev.md +++ b/doc/sphinx/source/libCEEDdev.md @@ -1,55 +1,150 @@ # Developer Notes -## Style Guide +## Library Design -Please check your code for style issues by running +LibCEED has a single user facing API for creating and using the libCEED objects ({ref}`CeedVector`, {ref}`CeedBasis`, etc). +Different Ceed backends are selected by instantiating a different {ref}`Ceed` object to create the other libCEED objects, in a [bridge pattern](https://en.wikipedia.org/wiki/Bridge_pattern). +At runtime, the user can select the different backend implementations to target different hardware, such as CPUs or GPUs. -`make format` +When designing new features, developers should place the function definitions for the user facing API in the header `/include/ceed/ceed.h`. +The basic implementation of these functions should typically be placed in `/interface/*.c` files. +The interface should pass any computationally expensive or hardware specific operations to a backend implementation. 
+A new method for the associated libCEED object can be added in `/include/ceed-impl.h`, with a corresponding `CEED_FTABLE_ENTRY` in `/interface/ceed.c` to allow backends to set their own implementations of this method. +Then in the creation of the backend specific implementation of the object, typically found in `/backends/[impl]/ceed-[impl]-[object].c`, the developer creates the backend implementation of the specific method and calls {c:func}`CeedSetBackendFunction` to set this implementation of the method for the backend. +Any supplemental functions intended to be used in the interface or by the backends may be added to the backend API in the header `/include/ceed/backend.h`. +The basic implementation of these functions should also be placed in `/interface/*.c` files. -In addition to those automatically enforced style rules, libCEED tends to follow the following code style conventions: +LibCEED generally follows a "CPU first" implementation strategy when adding new functionality to the user facing API. +If there are no performance specific considerations, it is generally recommended to include a basic CPU default implementation in `/interface/*.c`. +Any new functions must be well documented and tested. +Once the user facing API and the default implementation are in place and verified correct via tests, then the developer can focus on hardware specific implementations (AVX, CUDA, HIP, etc.) as necessary. -- Variable names: `snake_case` -- Strut members: `snake_case` -- Function and method names: `PascalCase` or language specific style -- Type names: `PascalCase` or language specific style -- Constant names: `CAPS_SNAKE_CASE` or language specific style +## Backend Inheritance -Also, documentation files should have one sentence per line to help make git diffs clearer and less disruptive. +A Ceed backend is not required to implement all libCeed objects or {ref}`CeedOperator` methods. 
+There are three mechanisms by which a Ceed backend can inherit implementations from another Ceed backend. -## Clang-tidy +1. Delegation - Developers may use {c:func}`CeedSetDelegate` to set a general delegate {ref}`Ceed` object. + This delegate {ref}`Ceed` will provide the implementation of any libCEED objects that the parent backend does not implement. + For example, the `/cpu/self/xsmm/serial` backend implements the `CeedTensorContract` object itself but delegates all other functionality to the `/cpu/self/opt/serial` backend. -Please check your code for common issues by running +2. Object delegation - Developers may use {c:func}`CeedSetObjectDelegate` to set a delegate {ref}`Ceed` object for a specific libCEED object. + This delegate {ref}`Ceed` will only provide the implementation of that specific libCEED object for the parent backend. + Object delegation has higher precedence than delegation. -`make tidy` +3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackCeed` to set a {ref}`Ceed` object to provide any unimplemented {ref}`CeedOperator` methods that support preconditioning, such as {c:func}`CeedOperatorLinearAssemble`. + The parent backend must implement the basic {ref}`CeedOperator` functionality. + Like the delegates above, this fallback {ref}`Ceed` object should be created and set in the backend `CeedInit` function. + In order to use operator fallback, the parent backend and fallback backend must use compatible E-vector and Q-vector layouts. + For example, `/gpu/cuda/gen` falls back to `/gpu/cuda/ref` for missing {ref}`CeedOperator` preconditioning support methods. + If an unimplemented method is called, then the parent `/gpu/cuda/gen` {ref}`Ceed` object uses its fallback `/gpu/cuda/ref` {ref}`Ceed` object to create a clone of the {ref}`CeedOperator`. + This clone {ref}`CeedOperator` is then used for the unimplemented preconditioning support methods. -which uses the `clang-tidy` utility included in recent releases of Clang. 
-This tool is much slower than actual compilation (`make -j8` parallelism helps). -To run on a single file, use +## Backend Families -`make interface/ceed.c.tidy` +There are 4 general 'families' of backend implementations. +As internal data layouts are specific to backend families, it is generally not possible to delegate between backend families. -for example. -All issues reported by `make tidy` should be fixed. +### CPU Backends -## Include-What-You-Use +The basic CPU backend with the simplest implementation is `/cpu/self/ref/serial`. +This backend contains the basic implementations of most objects that other backends rely upon. +Most of the other CPU backends only update the {ref}`CeedOperator` and `CeedTensorContract` objects. -Header inclusion for source files should follow the principal of 'include what you use' rather than relying upon transitive `#include` to define all symbols. +The `/cpu/self/ref/blocked` and `/cpu/self/opt/*` backends delegate to the `/cpu/self/ref/serial` backend. +The `/cpu/self/ref/blocked` backend updates the {ref}`CeedOperator` to use an E-vector and Q-vector ordering where data for 8 elements are interlaced to provide better vectorization. +The `/cpu/self/opt/*` backends update the {ref}`CeedOperator` to apply the action of the operator in 1 or 8 element batches, depending upon if the blocking strategy is used. +This reduces the memory required to utilize this backend significantly. -Every symbol that is used in the source file `foo.c` should be defined in `foo.c`, `foo.h`, or in a header file `#include`d in one of these two locations. -Please check your code by running the tool [`include-what-you-use`](https://include-what-you-use.org/) to see recommendations for changes to your source. -Most issues reported by `include-what-you-use` should be fixed; however this rule is flexible to account for differences in header file organization in external libraries. 
-If you have `include-what-you-use` installed in a sibling directory to libCEED or set the environment variable `IWYU_CC`, then you can use the makefile target `make iwyu`. +The `/cpu/self/avx/*` and `/cpu/self/xsmm/*` backends delegate to the corresponding `/cpu/self/opt/*` backends. +These backends update the `CeedTensorContract` objects using AVX intrinsics and libXSMM functions, respectively. -Header files should be listed in alphabetical order, with installed headers preceding local headers and `ceed` headers being listed first. -The `ceed-f64.h` and `ceed-f32.h` headers should only be included in `ceed.h`. +The `/cpu/self/memcheck/*` backends delegate to the `/cpu/self/ref/*` backends. +These backends replace many of the implementations with methods that include more verification checks and a memory management model that more closely matches the memory management for GPU backends. +These backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool and Valgrind headers. -```c -#include -#include -#include -#include -#include "ceed-avx.h" -``` +### GPU Backends + +The CUDA, HIP, and SYCL backend families all follow similar designs. +The CUDA and HIP backends are very similar, with minor differences. +While the SYCL backend was based upon the CUDA and HIP backends, there are more internal differences to accommodate OpenCL and Intel hardware. + +The `/gpu/*/ref` backends provide basic functionality. +In these backends, the operator is applied in multiple separate kernel launches, following the libCEED operator decomposition, where first {ref}`CeedElemRestriction` kernels map from the L-vectors to E-vectors, then {ref}`CeedBasis` kernels map from the E-vectors to Q-vectors, then the {ref}`CeedQFunction` kernel provides the action of the user quadrature point function, and the transpose {ref}`CeedBasis` and {ref}`CeedElemRestriction` kernels are applied to go back to the E-vectors and finally the L-vectors. 
+These kernels apply to all points across all elements in order to maximize the amount of work each kernel launch has. +Some of these kernels are compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC. + +The `/gpu/*/shared` backends delegate to the corresponding `/gpu/*/ref` backends. +These backends use shared memory to improve performance for the {ref}`CeedBasis` kernels. +All other libCEED objects are delegated to `/gpu/*/ref`. +These kernels are compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC. + +The `/gpu/*/gen` backends delegate to the corresponding `/gpu/*/shared` backends. +These backends write a single comprehensive kernel to apply the action of the {ref}`CeedOperator`, significantly improving performance by eliminating intermediate data structures and reducing the total number of kernel launches required. +This kernel is compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC. + +The `/gpu/*/magma` backends delegate to the corresponding `/gpu/cuda/ref` and `/gpu/hip/ref` backends. +These backends provide better performance for {ref}`CeedBasis` kernels but do not have the improvements from the `/gpu/*/gen` backends for {ref}`CeedOperator`. + +## Internal Layouts + +Ceed backends are free to use any E-vector and Q-vector data layout (including never fully forming these vectors) so long as the backend passes the `t5**` series tests and all examples. +There are several common layouts for L-vectors, E-vectors, and Q-vectors, detailed below: + +- **L-vector** layouts + + - L-vectors described by a standard {ref}`CeedElemRestriction` have a layout described by the `offsets` array and `comp_stride` parameter. + Data for node `i`, component `j`, element `k` can be found in the L-vector at index `offsets[i + k*elem_size] + j*comp_stride`. + - L-vectors described by a strided {ref}`CeedElemRestriction` have a layout described by the `strides` array. 
+ Data for node `i`, component `j`, element `k` can be found in the L-vector at index `i*strides[0] + j*strides[1] + k*strides[2]`. + +- **E-vector** layouts + + - If possible, backends should use {c:func}`CeedElemRestrictionSetELayout()` to use the `t2**` tests. + If the backend uses a strided E-vector layout, then the data for node `i`, component `j`, element `k` in the E-vector is given by `i*layout[0] + j*layout[1] + k*layout[2]`. + - Backends may choose to use a non-strided E-vector layout; however, the `t2**` tests will not function correctly in this case and these tests will need to be marked as allowable failures for this backend in the test suite. + +- **Q-vector** layouts + + - When the size of a {ref}`CeedQFunction` field is greater than `1`, data for quadrature point `i` component `j` can be found in the Q-vector at index `i + Q*j`, where `Q` is the total number of quadrature points in the Q-vector. + Backends are free to provide the quadrature points in any order. + - When the {ref}`CeedQFunction` field has `emode` `CEED_EVAL_GRAD`, data for quadrature point `i`, component `j`, derivative `k` can be found in the Q-vector at index `i + Q*j + Q*num_comp*k`. + - Backend developers must take special care to ensure that the data in the Q-vectors for a field with `emode` `CEED_EVAL_NONE` is properly ordered when the backend uses different layouts for E-vectors and Q-vectors. + +## CeedVector Array Access + +Backend implementations are expected to separately track 'owned' and 'borrowed' memory locations. +Backends are responsible for freeing 'owned' memory; 'borrowed' memory is set by the user and backends only have read/write access to 'borrowed' memory. +For any given precision and memory type, a backend should only have 'owned' or 'borrowed' memory, not both. + +Backends are responsible for tracking which memory locations contain valid data. 
+If the user calls {c:func}`CeedVectorTakeArray` on the only memory location that contains valid data, then the {ref}`CeedVector` is left in an *invalid state*. +To repair an *invalid state*, the user must set valid data by calling {c:func}`CeedVectorSetValue`, {c:func}`CeedVectorSetArray`, or {c:func}`CeedVectorGetArrayWrite`. + +Some checks for consistency and data validity with {ref}`CeedVector` array access are performed at the interface level. +All backends may assume that array access will conform to these guidelines: + +- Borrowed memory + + - {ref}`CeedVector` access to borrowed memory is set with {c:func}`CeedVectorSetArray` with `copy_mode = CEED_USE_POINTER` and revoked with {c:func}`CeedVectorTakeArray`. + The user must first call {c:func}`CeedVectorSetArray` with `copy_mode = CEED_USE_POINTER` for the appropriate precision and memory type before calling {c:func}`CeedVectorTakeArray`. + - {c:func}`CeedVectorTakeArray` cannot be called on a vector in an *invalid state*. + +- Owned memory + + - Owned memory can be allocated by calling {c:func}`CeedVectorSetValue` or by calling {c:func}`CeedVectorSetArray` with `copy_mode = CEED_COPY_VALUES`. + - Owned memory can be set by calling {c:func}`CeedVectorSetArray` with `copy_mode = CEED_OWN_POINTER`. + - Owned memory can also be allocated by calling {c:func}`CeedVectorGetArrayWrite`. + The user is responsible for manually setting the contents of the array in this case. + +- Data validity + + - Internal synchronization and user calls to {c:func}`CeedVectorSync` cannot be made on a vector in an *invalid state*. + - Calls to {c:func}`CeedVectorGetArray` and {c:func}`CeedVectorGetArrayRead` cannot be made on a vector in an *invalid state*. + - Calls to {c:func}`CeedVectorSetArray` and {c:func}`CeedVectorSetValue` can be made on a vector in an *invalid state*. + - Calls to {c:func}`CeedVectorGetArrayWrite` can be made on a vector in an *invalid state*. 
+ Data synchronization is not required for the memory location returned by {c:func}`CeedVectorGetArrayWrite`. + The caller should assume that all data at the memory location returned by {c:func}`CeedVectorGetArrayWrite` is *invalid*. ## Shape @@ -65,10 +160,10 @@ For example, the comment means that it can be traversed as ```c -for (d=0; d +#include +#include +#include +#include "ceed-avx.h" +``` diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md index 19300bc7c5..682e76f13b 100644 --- a/doc/sphinx/source/releasenotes.md +++ b/doc/sphinx/source/releasenotes.md @@ -11,6 +11,12 @@ On this page we provide a summary of the main API changes, new features and exam - Add `bool` field type for `CeedQFunctionContext` and related interfaces to use `bool` fields. - `CEED_BASIS_COLLOCATED` removed; users should only use `CEED_BASIS_NONE`. - Remove unneeded pointer for `CeedElemRestrictionGetELayout`. +- Change QFunction source include file handling in JiT compilers + - Add `CEED_RUNNING_JIT_PASS` compiler definition for wrapping header files that device JiT compilers cannot read + - Users should now prefer `#include ` rather than `#include ` in QFunction source files +- Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`. +- Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity. +- Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity. ### New features @@ -18,11 +24,21 @@ On this page we provide a summary of the main API changes, new features and exam - Add `CeedElemRestrictionGetLLayout` to provide L-vector layout for strided `CeedElemRestriction` created with `CEED_BACKEND_STRIDES`. - Add `CeedVectorReturnCeed` and similar when parent `Ceed` context for a libCEED object is only needed once in a calling scope. - Enable `#pragma once` for all JiT source; remove duplicate includes in JiT source string before compilation. 
+- Allow user to set additional compiler options for CUDA and HIP JiT. +Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`. +- Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`. +- Added support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen` for operators with both tensor and non-tensor bases. +- Add `CeedGetGitVersion()` to access the Git commit and dirty state of the repository at build time. +- Add `CeedGetBuildConfiguration()` to access compilers, flags, and related information about the build environment. ### Examples - Add deal.II example with CEED BP suite. +### Maintainability + +- OCCA backends were retired. + (v0-12)= ## v0.12 (Oct 31, 2023) diff --git a/examples/Makefile b/examples/Makefile index d32f406f5a..4cb4a1ed9e 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -36,6 +36,9 @@ all: ceed mfem nek petsc fluids solids ceed: make CEED_DIR=$(CEED_DIR) -C ceed all +dealii: + $(RM) -rf deal.II/build + mfem: make CEED_DIR=$(CEED_DIR) MFEM_DIR=$(MFEM_DIR) -C mfem all @@ -53,10 +56,12 @@ fluids: solids: make CEED_DIR=$(CEED_DIR) PETSC_DIR=$(PETSC_DIR) PETSC_ARCH=$(PETSC_ARCH) -C solids all -clean: + +clean: dealii +make -C ceed clean +make -C mfem clean +make -C nek clean + +make -C rust-qfunctions clean +make -C petsc clean +make -C fluids clean +make -C solids clean diff --git a/examples/README.md b/examples/README.md index e1177992f0..006d6c0c71 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,11 +9,11 @@ For more details, please see the dedicated [documentation section](https://libce ## Bakeoff Problems -% bps-inclusion-marker + The Center for Efficient Exascale Discretizations (CEED) uses Bakeoff Problems (BPs) to test and compare the performance of high-order finite element implementations. 
The definitions of the problems are given on the ceed [website](https://ceed.exascaleproject.org/bps/). -Each of the following bakeoff problems that use external discretization libraries (such as MFEM, PETSc, and Nek5000) are located in the subdirectories `mfem/`, `petsc/`, and `nek5000/`, respectively. +Each of the following bakeoff problems that use external discretization libraries (such as deal.II, MFEM, PETSc, and Nek5000) are located in the subdirectories `deal.II/`, `mfem/`, `petsc/`, and `nek5000/`, respectively. Here we provide a short summary: @@ -22,6 +22,13 @@ Here we provide a short summary: :widths: auto * - User code - Supported BPs +* - `deal.II` + - * BP1 (scalar mass operator) with $Q=P+1$ + * BP2 (vector mass operator) with $Q=P+1$ + * BP3 (scalar Laplace operator) with $Q=P+1$ + * BP4 (vector Laplace operator) with $Q=P+1$ + * BP5 (collocated scalar Laplace operator) with $Q=P$ + * BP6 (collocated vector Laplace operator) with $Q=P$ * - `mfem` - * BP1 (scalar mass operator) with $Q=P+1$ * BP3 (scalar Laplace operator) with $Q=P+1$ @@ -46,16 +53,16 @@ The BPs are parametrized by the number $P$ of Gauss-Legendre-Lobatto nodal point A $Q$-point Gauss-Legendre quadrature is used for all BPs except BP5 and BP6, which choose $Q = P$ and Gauss-Legendre-Lobatto quadrature to collocate with the interpolation nodes. This latter choice is popular in applications that use spectral element methods because it produces a diagonal mass matrix (enabling easy explicit time integration) and significantly reduces the number of floating point operations to apply the operator. -% bps-exclusion-marker + For a more detailed description of the operators employed in the BPs, please see the dedicated [BPs documentation section](https://libceed.org/en/latest/examples/bps.html). 
-## PETSc+libCEED Navier-Stokes Solver +## PETSc+libCEED Fluid Dynamics Navier-Stokes Mini-App The Navier-Stokes problem solves the compressible Navier-Stokes equations using an explicit or implicit time integration. A more detailed description of the problem formulation can be found in the [fluids/](./fluids) folder and the corresponding [fluids documentation page](https://libceed.org/en/latest/examples/fluids/index.html). -## PETSc+libCEED Solid mechanics elasticity mini-app +## PETSc+libCEED Solid Mechanics Elasticity Mini-App This example solves the steady-state static momentum balance equations using unstructured high-order finite/spectral element spatial discretizations. A more detailed description of the problem formulation can be found in the [solids/](./solids) folder and the corresponding [solids documentation page](https://libceed.org/en/latest/examples/solids/index.html). @@ -70,11 +77,20 @@ For a detailed description, please see the corresponding [area documentation pag These examples, located in the [petsc/](./petsc) folder, reproduce the Bakeoff Problems 1-6 on a discrete cubed-sphere, using PETSc. For a detailed description, please see the corresponding [problems on the cubed-sphere documentation page](https://libceed.org/en/latest/examples/petsc/index.html#bakeoff-problems-on-the-cubed-sphere). +## libCEED Python Examples + +These Jupyter notebooks explore the concepts of the libCEED API, including how to install the Python interface and the usage of each API object, with interactive examples. +The basic libCEED C examples in `/ceed` folder are also available as Python examples. + +## libCEED Rust Examples + +The basic libCEED C examples in `/ceed` folder are also available as Rust examples. 
+ ## Running Examples -To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run +To build the examples, set the `DEAL_II_DIR`, `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run ```{include} ../README.md -:start-after: running-examples-inclusion-marker -:end-before: benchmarks-marker +:start-after: +:end-before: ``` diff --git a/examples/bps.md b/examples/bps.md index 47ba00e80e..7014c71f77 100644 --- a/examples/bps.md +++ b/examples/bps.md @@ -3,8 +3,8 @@ # CEED Bakeoff Problems ```{include} ./README.md -:start-after: bps-inclusion-marker -:end-before: bps-exclusion-marker +:start-after: +:end-before: ``` (mass-operator)= diff --git a/examples/ceed/.gitignore b/examples/ceed/.gitignore index 9f00fb96a8..9250d2275b 100644 --- a/examples/ceed/.gitignore +++ b/examples/ceed/.gitignore @@ -1,2 +1,3 @@ ex1-volume ex2-surface +ex3-volume diff --git a/examples/ceed/Makefile b/examples/ceed/Makefile index 57528cc1cd..db88064a1e 100644 --- a/examples/ceed/Makefile +++ b/examples/ceed/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause @@ -9,7 +9,7 @@ OPT ?= -O -g # Ceed directory CEED_DIR ?= ../.. 
-CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c99 $(OPT) +CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c11 $(OPT) CEED_LIBS ?= -Wl,-rpath,$(abspath $(CEED_DIR)/lib) -L$(CEED_DIR)/lib -lceed -lm EXAMPLES.c = $(wildcard ex*.c) diff --git a/examples/ceed/README.md b/examples/ceed/README.md index 6d4543b2e3..0d6c64188c 100644 --- a/examples/ceed/README.md +++ b/examples/ceed/README.md @@ -1,4 +1,4 @@ -## libCEED: Basic Examples +## libCEED Basic Examples Two examples are provided that rely only upon libCEED without any external libraries. @@ -9,3 +9,8 @@ This example uses the mass matrix to compute the length, area, or volume of a re ### Example 2: ex2-surface This example uses the diffusion matrix to compute the surface area of a region, in 1D, 2D or 3D, depending upon runtime parameters. + +### Example 3: ex3-volume + +This example uses the mass matrix to compute the length, area, or volume of a region, depending upon runtime parameters. +Unlike ex1, this example also adds the diffusion matrix to add a zero contribution to this calculation while demonstrating the ability of libCEED to handle multiple basis evaluation modes on the same input and output vectors. diff --git a/examples/ceed/ex1-volume-f-c.h b/examples/ceed/ex1-volume-f-c.h new file mode 100644 index 0000000000..a3316192ff --- /dev/null +++ b/examples/ceed/ex1-volume-f-c.h @@ -0,0 +1,59 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +/// libCEED Q-function for building quadrature data for a mass operator +CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + long long int *build_data = (long long int *)ctx; + + // in[0] is Jacobians with shape [dim, dim, Q] + // in[1] is quadrature weights with shape [1, Q] + const CeedScalar *w = in[1]; + CeedScalar *q_data = out[0]; + + switch (build_data[0] + 10 * build_data[1]) { + case 11: { + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; } // End of Quadrature Point Loop + } break; + case 22: { + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i]; + } // End of Quadrature Point Loop + } break; + case 33: { + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + q_data[i] = + (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) + + J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) * + w[i]; + } // End of Quadrature Point Loop + } break; + } + return CEED_ERROR_SUCCESS; +} + +/// libCEED Q-function for applying a mass operator +CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0], out[0] are solution variables with shape [1, Q] + // in[1] is quadrature data with shape [1, Q] + const CeedScalar *u = in[0], *q_data = in[1]; + CeedScalar *v = out[0]; + + // Quadrature Point Loop + CeedPragmaSIMD 
for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; } // End of Quadrature Point Loop + return CEED_ERROR_SUCCESS; +} diff --git a/examples/ceed/ex1-volume-f.f90 b/examples/ceed/ex1-volume-f.f90 new file mode 100644 index 0000000000..580874efc2 --- /dev/null +++ b/examples/ceed/ex1-volume-f.f90 @@ -0,0 +1,557 @@ +! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +! All Rights Reserved. See the top-level LICENSE and NOTICE files for details. + +! SPDX-License-Identifier: BSD-2-Clause + +! This file is part of CEED: http://github.com/ceed + +! libCEED Example 1 + +! This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator. +! Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code. + +! The example has no dependencies, and is designed to be self-contained. +! For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples. + +! All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed). + +! Build with: + +! make ex1-volume [CEED_DIR = ] + +! Sample runs: + +! ./ex1-volume-f +! ./ex1-volume-f -ceed /cpu/self +! ./ex1-volume-f -ceed /gpu/cuda + +! Test in 1D-3D +! TESTARGS(name = "1D User QFunction") -ceed {ceed_resource} -d 1 -t +! TESTARGS(name = "2D User QFunction") -ceed {ceed_resource} -d 2 -t +! TESTARGS(name = "3D User QFunction") -ceed {ceed_resource} -d 3 -t +! TESTARGS(name = "1D Gallery QFunction") -ceed {ceed_resource} -d 1 -t -g +! TESTARGS(name = "2D Gallery QFunction") -ceed {ceed_resource} -d 2 -t -g +! 
TESTARGS(name = "3D Gallery QFunction") -ceed {ceed_resource} -d 3 -t -g + +!> @file +!> libCEED example using mass operator to compute volume + + include 'ex1-volume-f.h' + +!----------------------------------------------------------------------- +subroutine getcartesianmeshsize(fe_dim, degree, prob_size, num_xyz) + implicit none + integer fe_dim + integer degree + integer prob_size + integer num_xyz(3) + + integer num_elem + integer s, r, d, sd + num_elem = prob_size/(degree**fe_dim) + s = 0 + +! Use the approximate formula: +! prob_size ~ num_elem * degree^dim +! find s: num_elem/2 < 2^s <= num_elem + + do while (num_elem > 1) + num_elem = num_elem/2 + s = s + 1 + end do + r = mod(s, fe_dim) + + do d = 1, fe_dim + sd = s/fe_dim + if (r > 0) then + sd = sd + 1 + r = r - 1 + end if + num_xyz(d) = ISHFT(1, sd) + end do +end + +!----------------------------------------------------------------------- +subroutine buildcartesianrestriction(ceed, fe_dim, num_xyz, degree, num_comp, mesh_size, num_qpts, restriction,& +& q_data_restriction, err) + implicit none + include 'ceed/fortran.h' + + integer ceed + integer fe_dim + integer num_xyz(3) + integer degree + integer num_comp + integer mesh_size + integer num_qpts + integer restriction + integer q_data_restriction + integer err + + integer p + integer num_nodes + integer elem_qpts + integer num_elem + integer scalar_size + integer nd(3) + integer elem_nodes_size + integer e_xyz(3), re + integer g_nodes, g_nodes_stride, r_nodes + integer, dimension (:), allocatable :: elem_nodes + + integer i, j, k + + p = degree + 1 + num_nodes = p**fe_dim + elem_qpts = num_qpts**fe_dim + num_elem = 1 + scalar_size = 1 + + do i = 1, fe_dim + num_elem = num_elem * num_xyz(i) + nd(i) = num_xyz(i) * (p - 1) + 1 + scalar_size = scalar_size*nd(i) + end do + mesh_size = scalar_size*num_comp +! elem: 0 1 n-1 +! |---*-...-*---|---*-...-*---|- ... -|--...--| +! 
num_nodes: 0 1 p-1 p p+1 2*p n*p + elem_nodes_size = num_elem*num_nodes + allocate (elem_nodes(elem_nodes_size)) + + do i = 1, num_elem + e_xyz(1) = 1 + e_xyz(2) = 1 + e_xyz(3) = 1 + re = i - 1 + + do j = 1, fe_dim + e_xyz(j) = mod(re, num_xyz(j)) + re = re/num_xyz(j) + end do + + do j = 1, num_nodes + g_nodes = 0 + g_nodes_stride = 1 + r_nodes = j - 1 + + do k = 1, fe_dim + g_nodes = g_nodes + (e_xyz(k) * (p - 1) + mod(r_nodes, p)) * g_nodes_stride + g_nodes_stride = g_nodes_stride * nd(k) + r_nodes = r_nodes/p + end do + elem_nodes((i - 1) * num_nodes + j) = g_nodes + end do + end do + + call ceedelemrestrictioncreate(ceed, num_elem, num_nodes, num_comp, scalar_size, mesh_size, ceed_mem_host,& + &ceed_copy_values, elem_nodes, restriction, err) + if (q_data_restriction /= ceed_qfunction_none) then + call ceedelemrestrictioncreatestrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem,& + &ceed_strides_backend, q_data_restriction, err) + end if + deallocate (elem_nodes) +end + +!----------------------------------------------------------------------- +subroutine transformmeshcoords(fe_dim, mesh_size, coords, exact_volume, err) + implicit none + + integer fe_dim + integer mesh_size, scalar_size + real*8 coords(mesh_size) + real*8 exact_volume + real*8 m_pi, m_pi_2 + parameter(m_pi = 3.14159265358979323846d0) + parameter(m_pi_2 = 1.57079632679489661923d0) + integer err + + integer i + real*8 u, v + + scalar_size = mesh_size/fe_dim + select case (fe_dim) + case (1) + do i = 1, scalar_size + coords(i) = 0.5d0 + (1.d0/sqrt(3.d0)) * sin((2.d0/3.d0) * m_pi * (coords(i) - 0.5d0)) + end do + exact_volume = 1.d0 + + case (2, 3) + do i = 1, scalar_size + u = 1.d0 + coords(i) + v = m_pi_2 * coords(i + scalar_size) + + coords(i) = u * cos(v) + coords(i + scalar_size) = u * sin(v) + end do + exact_volume = 3.d0/4.d0 * m_pi + end select +end + +!----------------------------------------------------------------------- +subroutine setcartesianmeshcoords(fe_dim, 
num_xyz, mesh_degree, mesh_coords, exact_volume, err) + implicit none + include 'ceed/fortran.h' + + integer fe_dim + integer num_xyz(3) + integer mesh_degree + integer mesh_coords + real*8 exact_volume + integer err + + integer p + integer scalar_size + integer coords_size + integer r_nodes + integer d_1d + integer nd(3) + real*8, dimension (:), allocatable :: nodes, qpts + real*8, dimension (:), allocatable :: coords + integer*8 offset + integer i, j + p = mesh_degree + 1 + scalar_size = 1 + + do i = 1, fe_dim + nd(i) = num_xyz(i) * (p - 1) + 1 + scalar_size = scalar_size * nd(i) + end do + + coords_size = scalar_size * fe_dim + allocate (coords(coords_size)) + +! The H1 basis uses Lobatto quadrature points as nodes + allocate (nodes(p)) + allocate (qpts(p)) + call ceedlobattoquadrature(p, nodes, qpts, err) + deallocate(qpts) + do i = 1, p + nodes(i) = 0.5 + 0.5 * nodes(i) + end do + + do i = 1, scalar_size + r_nodes = i - 1 + + do j = 1, fe_dim + d_1d = mod(r_nodes, nd(j)) + coords(scalar_size * (j - 1) + i) = ((d_1d/(p - 1)) + nodes(mod(d_1d, p - 1) + 1))/num_xyz(j) + r_nodes = r_nodes/nd(j) + end do + end do + deallocate(nodes) + + call transformmeshcoords(fe_dim, coords_size, coords, exact_volume, err) + + offset = 0 + call ceedvectorsetarray(mesh_coords, ceed_mem_host, ceed_copy_values, coords, offset, err) + deallocate(coords) +end + +!----------------------------------------------------------------------- +program main + implicit none + include 'ceed/fortran.h' + + character ceed_spec*32 + integer fe_dim, num_comp_x, mesh_degree, sol_degree, num_qpts + integer num_elem, num_xyz(3), elem_qpts + integer prob_size, mesh_size, sol_size + integer help, test, gallery, benchmark + integer i, num_args, err + character arg*32, arg_value*32 + real*8 exact_volume, computed_volume + + integer ceed + real*8, dimension (:), allocatable :: u_array, v_array + integer mesh_coords, q_data, u, v + integer mesh_restriction, sol_restriction, q_data_restriction + integer 
mesh_basis, sol_basis + integer*8 offset + integer build_ctx + integer build_ctx_size + parameter(build_ctx_size = 2) + integer*8 build_ctx_data(build_ctx_size) + integer qf_build, qf_apply + integer op_build, op_apply + + external build_mass, apply_mass + +! Initial values + ceed_spec = '/cpu/self' + fe_dim = 3 + num_comp_x = 3 + mesh_degree = 4 + sol_degree = 4 + num_qpts = mesh_degree + 2 + prob_size = -1 + help = 0 + test = 0 + gallery = 0 + benchmark = 0 + +! Process command line arguments + + num_args = command_argument_count() + do i = 1, num_args + call get_command_argument(i, arg) + + select case (arg) +! LCOV_EXCL_START + case ('-h') + help = 1 + + case ('-c', '-ceed') + call get_command_argument(i + 1, ceed_spec) + + case ('-d') + call get_command_argument(i + 1, arg_value) + read(arg_value, '(I10)') fe_dim + num_comp_x = fe_dim + + case ('-m') + call get_command_argument(i + 1, arg_value) + read(arg_value, '(I10)') mesh_degree + + case ('-p') + call get_command_argument(i + 1, arg_value) + read(arg_value, '(I10)') sol_degree + + case ('-q') + call get_command_argument(i + 1, arg_value) + read(arg_value, '(I10)') num_qpts + + case ('-s') + call get_command_argument(i + 1, arg_value) + read(arg_value, '(I10)') prob_size + + case ('-b') + call get_command_argument(i + 1, arg_value) + read(arg_value, '(I10)') benchmark +! LCOV_EXCL_STOP + + case ('-t') + test = 1 + + case ('-g') + gallery = 1 + end select + end do + + if (prob_size < 0) then + if (test == 1) then + prob_size = 8 * 16 + else + prob_size = 256 * 1024 + end if + end if + +! Print options + if ((test /= 1) .OR. (help == 1)) then +! LCOV_EXCL_START + write (*, *) 'Selected options: [command line option] : ' + write (*, *) ' Ceed specification [-c] : ', ceed_spec + write (*, *) ' Mesh dimension [-d] : ', fe_dim + write (*, *) ' Mesh degree [-m] : ', mesh_degree + write (*, *) ' Solution degree [-p] : ', sol_degree + write (*, *) ' Num. 1D quadrature pts [-q] : ', num_qpts + write (*, *) ' Approx. 
# unknowns [-s] : ', prob_size + if (gallery == 1) then + write (*, *) ' QFunction source [-g] : gallery' + else + write (*, *) ' QFunction source [-g] : header' + end if + if (help == 1) then + if (test == 0) then + write (*, *) 'Test/quiet mode is OFF (use -t to enable)' + else + write (*, *) 'Test/quiet mode is ON' + end if + end if +! LCOV_EXCL_STOP + end if + +! Select appropriate backend and logical device based on the (-ceed) command line argument + call ceedinit(trim(ceed_spec)//char(0), ceed, err) + +! Construct the mesh and solution bases + call ceedbasiscreatetensorh1lagrange(ceed, fe_dim, num_comp_x, mesh_degree + 1, num_qpts, ceed_gauss, mesh_basis,& + &err) + call ceedbasiscreatetensorh1lagrange(ceed, fe_dim, 1, sol_degree + 1, num_qpts, ceed_gauss, sol_basis, err) + +! Determine the mesh size based on the given approximate problem size + call getcartesianmeshsize(fe_dim, sol_degree, prob_size, num_xyz) + if (test == 0) then +! LCOV_EXCL_START + write (*, '(A16, I8)', advance='no') 'Mesh size: nx = ', num_xyz(1) + if (num_comp_x > 1) then + write (*, '(A7, I8)', advance='no') ', ny = ', num_xyz(2) + end if + if (num_comp_x > 2) then + write (*, '(A7, I8)', advance='no') ', nz = ', num_xyz(3) + end if + write (*, *) +! LCOV_EXCL_STOP + endif + +! Build CeedElemRestriction objects describing the mesh and solution discrete representation + call buildcartesianrestriction(ceed, fe_dim, num_xyz, mesh_degree, num_comp_x, mesh_size, num_qpts,& + &mesh_restriction, ceed_qfunction_none, err) + call buildcartesianrestriction(ceed, fe_dim, num_xyz, sol_degree, 1, sol_size, num_qpts, sol_restriction,& + &q_data_restriction, err) + + if (test == 0) then +! LCOV_EXCL_START + write (*, *) 'Number of mesh nodes : ', mesh_size/fe_dim + write (*, *) 'Number of solution nodes : ', sol_size +! LCOV_EXCL_STOP + end if + +! Create a CeedVector with the mesh coordinates +! 
Apply a transformation to the mesh + call ceedvectorcreate(ceed, mesh_size, mesh_coords, err) + call setcartesianmeshcoords(fe_dim, num_xyz, mesh_degree, mesh_coords, exact_volume, err) + +! Context data to be passed to the 'build_mass' QFunction + build_ctx_data(1) = fe_dim + build_ctx_data(2) = num_comp_x + call ceedqfunctioncontextcreate(ceed, build_ctx, err) +! Note: The context technically only takes arrays of double precision values, but we can pass arrays of ints of the same length + offset = 0 + call ceedqfunctioncontextsetdata(build_ctx, ceed_mem_host, ceed_use_pointer, build_ctx_size, build_ctx_data,& + &offset, err) + +! Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data + if (gallery == 1) then + select case (fe_dim) + case (1) + call ceedqfunctioncreateinteriorbyname(ceed, 'Mass1DBuild', qf_build, err) + + case (2) + call ceedqfunctioncreateinteriorbyname(ceed, 'Mass2DBuild', qf_build, err) + + case (3) + call ceedqfunctioncreateinteriorbyname(ceed, 'Mass3DBuild', qf_build, err) + end select + else + call ceedqfunctioncreateinterior(ceed, 1, build_mass,& + &SOURCE_DIR& + &//'ex1-volume-f-c.h:build_mass'//char(0), qf_build, err) + call ceedqfunctionaddinput(qf_build, 'dx', num_comp_x * fe_dim, ceed_eval_grad, err) + call ceedqfunctionaddinput(qf_build, 'weights', 1, ceed_eval_weight, err) + call ceedqfunctionaddoutput(qf_build, 'qdata', 1, ceed_eval_none, err) + call ceedqfunctionsetcontext(qf_build, build_ctx, err) + end if + +! 
Create the operator that builds the quadrature data for the mass operator + call ceedoperatorcreate(ceed, qf_build, ceed_qfunction_none, ceed_qfunction_none, op_build, err) + call ceedoperatorsetfield(op_build, 'dx', mesh_restriction, mesh_basis, ceed_vector_active, err) + call ceedoperatorsetfield(op_build, 'weights', ceed_elemrestriction_none, mesh_basis, ceed_vector_none, err) + call ceedoperatorsetfield(op_build, 'qdata', q_data_restriction, ceed_basis_none, ceed_vector_active, err) + +! Compute the quadrature data for the mass operator + num_elem = 1 + elem_qpts = num_qpts**fe_dim + do i = 1, fe_dim + num_elem = num_elem * num_xyz(i) + end do + call ceedvectorcreate(ceed, num_elem * elem_qpts, q_data, err) + call ceedoperatorapply(op_build, mesh_coords, q_data, ceed_request_immediate, err) + +! Create the QFunction that defines the action of the mass operator + if (gallery == 1) then + call ceedqfunctioncreateinteriorbyname(ceed, 'MassApply', qf_apply, err) + else + call ceedqfunctioncreateinterior(ceed, 1, apply_mass,& + &SOURCE_DIR& + &//'ex1-volume-f-c.h:apply_mass'//char(0), qf_apply, err) + call ceedqfunctionaddinput(qf_apply, 'u', 1, ceed_eval_interp, err) + call ceedqfunctionaddinput(qf_apply, 'qdata', 1, ceed_eval_none, err) + call ceedqfunctionaddoutput(qf_apply, 'v', 1, ceed_eval_interp, err) + end if + +! Create the mass operator + call ceedoperatorcreate(ceed, qf_apply, ceed_qfunction_none, ceed_qfunction_none, op_apply, err) + call ceedoperatorsetfield(op_apply, 'u', sol_restriction, sol_basis, ceed_vector_active, err) + call ceedoperatorsetfield(op_apply, 'qdata', q_data_restriction, ceed_basis_none, q_data, err) + call ceedoperatorsetfield(op_apply, 'v', sol_restriction, sol_basis, ceed_vector_active, err) + +! 
Create auxiliary solution-size vectors + allocate (u_array(sol_size)) + allocate (v_array(sol_size)) + + call ceedvectorcreate(ceed, sol_size, u, err) + offset = 0 + call ceedvectorsetarray(u, ceed_mem_host, ceed_use_pointer, u_array, offset, err) + call ceedvectorcreate(ceed, sol_size, v, err) + offset = 0 + call ceedvectorsetarray(v, ceed_mem_host, ceed_use_pointer, v_array, offset, err) + +! Initialize 'u' with ones + call ceedvectorsetvalue(u, 1.d0, err) + +! Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1 + call ceedoperatorapply(op_apply, u, v, ceed_request_immediate, err) + +! Benchmark runs + if (test /= 1 .AND. benchmark /= 0) then +! LCOV_EXCL_START + write (*, *) ' Executing ', benchmark, ' benchmarking runs...' +! LCOV_EXCL_STOP + end if + do i = 1, benchmark +! LCOV_EXCL_START + call ceedoperatorapply(op_apply, u, v, ceed_request_immediate, err) +! LCOV_EXCL_STOP + end do + +! Compute and print the sum of the entries of 'v' giving the mesh volume + computed_volume = 0.d0 + + call ceedvectorgetarrayread(v, ceed_mem_host, v_array, offset, err) + do i = 1, sol_size + computed_volume = computed_volume + v_array(offset + i) + end do + call ceedvectorrestorearrayread(v, v_array, offset, err) + + if (test /= 1) then +! LCOV_EXCL_START + write (*, *) ' done.' + write (*, *) 'Exact mesh volume :', exact_volume + write (*, *) 'Computed mesh volume :', computed_volume + write (*, *) 'Volume error :', (exact_volume - computed_volume) +! LCOV_EXCL_STOP + else + if (fe_dim == 1) then + if (abs(exact_volume - computed_volume) > 200.d0 * 1e-15) then +! LCOV_EXCL_START + write (*, *) 'Volume error : ', (exact_volume - computed_volume) +! LCOV_EXCL_STOP + end if + else + if (abs(exact_volume - computed_volume) > 1e-5) then +! LCOV_EXCL_START + write (*, *) 'Volume error : ', (exact_volume - computed_volume) +! LCOV_EXCL_STOP + end if + end if + end if + +! 
Free dynamically allocated memory + call ceedvectordestroy(mesh_coords, err) + call ceedvectordestroy(q_data, err) + call ceedvectordestroy(u, err) + call ceedvectordestroy(v, err) + deallocate (u_array) + deallocate (v_array) + call ceedbasisdestroy(sol_basis, err) + call ceedbasisdestroy(mesh_basis, err) + call ceedqfunctioncontextdestroy(build_ctx, err) + call ceedqfunctiondestroy(qf_build, err) + call ceedqfunctiondestroy(qf_apply, err) + call ceedoperatordestroy(op_build, err) + call ceedoperatordestroy(op_apply, err) + call ceeddestroy(ceed, err) +end +!----------------------------------------------------------------------- diff --git a/examples/ceed/ex1-volume-f.h b/examples/ceed/ex1-volume-f.h new file mode 100644 index 0000000000..08ea68ef6f --- /dev/null +++ b/examples/ceed/ex1-volume-f.h @@ -0,0 +1,55 @@ +!----------------------------------------------------------------------- +subroutine build_mass(ctx, q, j, w, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16,& + qdata, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, ierr) + integer*8 ctx(2) + integer*8 fe_dim, space_dim +! j is Jacobians with shape [dim, dim, Q] +! w is quadrature weights with shape [1, Q] + real*8 j(1) + real*8 w(1) +! 
qdata is quadrature data with shape [1, Q] + real*8 qdata(1) + integer q, ierr + + fe_dim = ctx(1) + space_dim = ctx(2) + + select case (fe_dim + 10*space_dim) + case (11) + do i = 1, q + qdata(i) = j(i) * w(i) + end do + + case (22) + do i = 1, q + qdata(i) = (j(0*q + i)*j(3*q + i) - j(1*q + i)*j(2*q + i)) * w(i) + end do + + case (33) + do i = 1, q + qdata(i) = (j(0*q + i) * (j(4*q + i)*j(8*q + i) - j(5*q + i)*j(7*q + i)) -& + &j(1*q + i) * (j(3*q + i)*j(8*q + i) - j(5*q + i)*j(6*q + i)) +& + &j(2*q + i) * (j(3*q + i)*j(7*q + i) - j(4*q + i)*j(6*q + i))) * w(i) + end do + end select + ierr = 0 +end + +!----------------------------------------------------------------------- +subroutine apply_mass(ctx, q, u, qdata, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16,& + v, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, ierr) + integer*8 ctx +! u is solution variables with shape [1, Q] +! qdata is quadrature data with shape [1, Q] + real*8 u(1) + real*8 qdata(1) +! v is solution variables with shape [1, Q] + real*8 v(1) + integer q, ierr + + do i = 1, q + v(i) = qdata(i) * u(i) + end do + ierr = 0 +end +!----------------------------------------------------------------------- diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c index 04852f28c1..354f977113 100644 --- a/examples/ceed/ex1-volume.c +++ b/examples/ceed/ex1-volume.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -60,7 +60,7 @@ int main(int argc, const char *argv[]) { CeedInt sol_degree = 4; // polynomial degree for the solution CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points CeedInt prob_size = -1; // approximate problem size - CeedInt help = 0, test = 0, gallery = 0; + CeedInt help = 0, test = 0, gallery = 0, benchmark = 0; // Process command line arguments. for (int ia = 1; ia < argc; ia++) { @@ -81,6 +81,8 @@ int main(int argc, const char *argv[]) { parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1; } else if (!strcmp(argv[ia], "-s")) { parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-b")) { + parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1; } else if (!strcmp(argv[ia], "-t")) { test = 1; } else if (!strcmp(argv[ia], "-g")) { @@ -115,15 +117,18 @@ int main(int argc, const char *argv[]) { // Select appropriate backend and logical device based on the (-ceed) command line argument. Ceed ceed; + CeedInit(ceed_spec, &ceed); // Construct the mesh and solution bases. CeedBasis mesh_basis, sol_basis; + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis); CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis); // Determine the mesh size based on the given approximate problem size. CeedInt num_xyz[dim]; + GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz); if (!test) { // LCOV_EXCL_START @@ -137,6 +142,7 @@ int main(int argc, const char *argv[]) { // Build CeedElemRestriction objects describing the mesh and solution discrete representations. 
CeedInt mesh_size, sol_size; CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction; + BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL); BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, &q_data_restriction); if (!test) { @@ -148,6 +154,7 @@ int main(int argc, const char *argv[]) { // Create a CeedVector with the mesh coordinates. CeedVector mesh_coords; + CeedVectorCreate(ceed, mesh_size, &mesh_coords); SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords); @@ -157,12 +164,14 @@ int main(int argc, const char *argv[]) { // Context data to be passed to the 'build_mass' QFunction. CeedQFunctionContext build_ctx; struct BuildContext build_ctx_data; + build_ctx_data.dim = build_ctx_data.space_dim = dim; CeedQFunctionContextCreate(ceed, &build_ctx); CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); // Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data. CeedQFunction qf_build; + if (gallery) { // This creates the QFunction via the gallery. char name[13] = ""; @@ -179,6 +188,7 @@ int main(int argc, const char *argv[]) { // Create the operator that builds the quadrature data for the mass operator. 
CeedOperator op_build; + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); @@ -188,12 +198,14 @@ int main(int argc, const char *argv[]) { CeedVector q_data; CeedInt elem_qpts = CeedIntPow(num_qpts, dim); CeedInt num_elem = 1; + for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d]; CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data); CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE); // Create the QFunction that defines the action of the mass operator. CeedQFunction qf_apply; + if (gallery) { // This creates the QFunction via the gallery. CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply); @@ -207,6 +219,7 @@ int main(int argc, const char *argv[]) { // Create the mass operator. CeedOperator op_apply; + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data); @@ -214,6 +227,7 @@ int main(int argc, const char *argv[]) { // Create auxiliary solution-size vectors. 
CeedVector u, v; + CeedVectorCreate(ceed, sol_size, &u); CeedVectorCreate(ceed, sol_size, &v); @@ -223,10 +237,24 @@ int main(int argc, const char *argv[]) { // Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1 CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + // Benchmark runs + if (!test && benchmark) { + // LCOV_EXCL_START + printf(" Executing %d benchmarking runs...\n", benchmark); + // LCOV_EXCL_STOP + } + for (CeedInt i = 0; i < benchmark; i++) { + // LCOV_EXCL_START + CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + // LCOV_EXCL_STOP + } + // Compute and print the sum of the entries of 'v' giving the mesh volume. CeedScalar volume = 0.; + { const CeedScalar *v_array; + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); @@ -240,6 +268,7 @@ int main(int argc, const char *argv[]) { // LCOV_EXCL_STOP } else { CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 
1E-5 : 1E-5); + if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume); } @@ -267,13 +296,16 @@ int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt // prob_size ~ num_elem * degree^dim CeedInt num_elem = prob_size / CeedIntPow(degree, dim); CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem + while (num_elem > 1) { num_elem /= 2; s++; } CeedInt r = s % dim; + for (CeedInt d = 0; d < dim; d++) { CeedInt sd = s / dim; + if (r > 0) { sd++; r--; @@ -289,6 +321,7 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element CeedInt nd[3], num_elem = 1, scalar_size = 1; + for (CeedInt d = 0; d < dim; d++) { num_elem *= num_xyz[d]; nd[d] = num_xyz[d] * (p - 1) + 1; @@ -299,15 +332,19 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed // |---*-...-*---|---*-...-*---|- ... 
-|--...--| // num_nodes: 0 1 p-1 p p+1 2*p n*p CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes); + for (CeedInt e = 0; e < num_elem; e++) { CeedInt e_xyz[3] = {1, 1, 1}, re = e; + for (CeedInt d = 0; d < dim; d++) { e_xyz[d] = re % num_xyz[d]; re /= num_xyz[d]; } CeedInt *local_elem_nodes = elem_nodes + e * num_nodes; + for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) { CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes; + for (CeedInt d = 0; d < dim; d++) { g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride; g_nodes_stride *= nd[d]; @@ -318,8 +355,9 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed } CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes, restriction); - if (q_data_restriction) + if (q_data_restriction) { CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction); + } free(elem_nodes); return 0; } @@ -327,20 +365,25 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) { CeedInt p = mesh_degree + 1; CeedInt nd[3], scalar_size = 1; + for (CeedInt d = 0; d < dim; d++) { nd[d] = num_xyz[d] * (p - 1) + 1; scalar_size *= nd[d]; } CeedScalar *coords; + CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords); CeedScalar *nodes = malloc(sizeof(CeedScalar) * p); + // The H1 basis uses Lobatto quadrature points as nodes. 
CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i]; for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) { CeedInt r_nodes = gs_nodes; + for (CeedInt d = 0; d < dim; d++) { - CeedInt d_1d = r_nodes % nd[d]; + CeedInt d_1d = r_nodes % nd[d]; + coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d]; r_nodes /= nd[d]; } @@ -358,6 +401,7 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degre CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) { CeedScalar exact_volume; CeedScalar *coords; + CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords); if (dim == 1) { for (CeedInt i = 0; i < mesh_size; i++) { @@ -367,10 +411,12 @@ CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_c exact_volume = 1.; } else { CeedInt num_nodes = mesh_size / dim; + for (CeedInt i = 0; i < num_nodes; i++) { // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi CeedScalar u = coords[i], v = coords[i + num_nodes]; + u = 1. + u; v = M_PI_2 * v; coords[i] = u * cos(v); diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h index d78ea16c6f..581cff997e 100644 --- a/examples/ceed/ex1-volume.h +++ b/examples/ceed/ex1-volume.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include /// A structure used to pass additional data to f_build_mass struct BuildContext { @@ -14,47 +14,51 @@ struct BuildContext { /// libCEED Q-function for building quadrature data for a mass operator CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // in[0] is Jacobians with shape [dim, nc=dim, Q] - // in[1] is quadrature weights, size (Q) struct BuildContext *build_data = (struct BuildContext *)ctx; - const CeedScalar *J = in[0], *w = in[1]; - CeedScalar *q_data = out[0]; + + // in[0] is Jacobians with shape [dim, dim, Q] + // in[1] is quadrature weights with shape [1, Q] + const CeedScalar *w = in[1]; + CeedScalar *q_data = out[0]; switch (build_data->dim + 10 * build_data->space_dim) { - case 11: + case 11: { + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; } // End of Quadrature Point Loop - break; - case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; } // End of Quadrature Point Loop + } break; + case 22: { + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + // Quadrature Point Loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // 0 2 - // 1 3 - q_data[i] = (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]) * w[i]; + q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i]; } // End of Quadrature Point Loop - break; - case 33: + } break; + case 33: { + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + // Quadrature Point Loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // 0 3 6 - // 1 4 7 - // 2 5 8 - q_data[i] = (J[i + Q * 0] * (J[i + Q * 4] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 7]) - - J[i + Q * 1] * (J[i 
+ Q * 3] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 6]) + - J[i + Q * 2] * (J[i + Q * 3] * J[i + Q * 7] - J[i + Q * 4] * J[i + Q * 6])) * - w[i]; + q_data[i] = + (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) + + J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) * + w[i]; } // End of Quadrature Point Loop - break; + } break; } - return 0; + return CEED_ERROR_SUCCESS; } /// libCEED Q-function for applying a mass operator CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0], out[0] are solution variables with shape [1, Q] + // in[1] is quadrature data with shape [1, Q] const CeedScalar *u = in[0], *q_data = in[1]; CeedScalar *v = out[0]; // Quadrature Point Loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c index d536068800..2191e4dc63 100644 --- a/examples/ceed/ex2-surface.c +++ b/examples/ceed/ex2-surface.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -60,7 +60,7 @@ int main(int argc, const char *argv[]) { CeedInt sol_degree = 4; // polynomial degree for the solution CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points CeedInt prob_size = -1; // approximate problem size - CeedInt help = 0, test = 0, gallery = 0; + CeedInt help = 0, test = 0, gallery = 0, benchmark = 0; // Process command line arguments. 
for (int ia = 1; ia < argc; ia++) { @@ -81,6 +81,8 @@ int main(int argc, const char *argv[]) { parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1; } else if (!strcmp(argv[ia], "-s")) { parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-b")) { + parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1; } else if (!strcmp(argv[ia], "-t")) { test = 1; } else if (!strcmp(argv[ia], "-g")) { @@ -119,15 +121,18 @@ int main(int argc, const char *argv[]) { // Select appropriate backend and logical device based on the (-ceed) command line argument. Ceed ceed; + CeedInit(ceed_spec, &ceed); // Construct the mesh and solution bases. CeedBasis mesh_basis, sol_basis; + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis); CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis); // Determine the mesh size based on the given approximate problem size. CeedInt num_xyz[3]; + GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz); if (!test) { @@ -142,6 +147,7 @@ int main(int argc, const char *argv[]) { // Build CeedElemRestriction objects describing the mesh and solution discrete representations. CeedInt mesh_size, sol_size; CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction; + BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL); BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restriction); BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, NULL); @@ -154,6 +160,7 @@ int main(int argc, const char *argv[]) { // Create a CeedVector with the mesh coordinates. 
CeedVector mesh_coords; + CeedVectorCreate(ceed, mesh_size, &mesh_coords); SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords); @@ -163,12 +170,14 @@ int main(int argc, const char *argv[]) { // Context data to be passed to the 'build_diff' QFunction. CeedQFunctionContext build_ctx; struct BuildContext build_ctx_data; + build_ctx_data.dim = build_ctx_data.space_dim = dim; CeedQFunctionContextCreate(ceed, &build_ctx); CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); // Create the QFunction that builds the diffusion operator (i.e. computes its quadrature data) and set its context data. CeedQFunction qf_build; + if (gallery) { // This creates the QFunction via the gallery. char name[16] = ""; @@ -185,6 +194,7 @@ int main(int argc, const char *argv[]) { // Create the operator that builds the quadrature data for the diffusion operator. CeedOperator op_build; + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); @@ -194,15 +204,17 @@ int main(int argc, const char *argv[]) { CeedVector q_data; CeedInt elem_qpts = CeedIntPow(num_qpts, dim); CeedInt num_elem = 1; + for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d]; CeedVectorCreate(ceed, num_elem * elem_qpts * dim * (dim + 1) / 2, &q_data); CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE); // Create the QFunction that defines the action of the diffusion operator. CeedQFunction qf_apply; + if (gallery) { // This creates the QFunction via the gallery. 
- char name[16] = ""; + char name[25] = ""; snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DApply", dim); CeedQFunctionCreateInteriorByName(ceed, name, &qf_apply); } else { @@ -216,6 +228,7 @@ int main(int argc, const char *argv[]) { // Create the diffusion operator. CeedOperator op_apply; + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); CeedOperatorSetField(op_apply, "du", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data); @@ -223,6 +236,7 @@ int main(int argc, const char *argv[]) { // Create auxiliary solution-size vectors. CeedVector u, v; + CeedVectorCreate(ceed, sol_size, &u); CeedVectorCreate(ceed, sol_size, &v); @@ -230,6 +244,7 @@ int main(int argc, const char *argv[]) { { CeedScalar *u_array; const CeedScalar *x_array; + CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); CeedVectorGetArrayRead(mesh_coords, CEED_MEM_HOST, &x_array); for (CeedInt i = 0; i < sol_size; i++) { @@ -243,10 +258,23 @@ int main(int argc, const char *argv[]) { // Compute the mesh surface area using the diff operator: surface_area = 1^T \cdot abs( K \cdot x). CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + // Benchmark runs + if (!test && benchmark) { + // LCOV_EXCL_START + printf(" Executing %d benchmarking runs...\n", benchmark); + // LCOV_EXCL_STOP + } + for (CeedInt i = 0; i < benchmark; i++) { + // LCOV_EXCL_START + CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + // LCOV_EXCL_STOP + } + // Compute and print the sum of the entries of 'v' giving the mesh surface area. CeedScalar surface_area = 0.; { const CeedScalar *v_array; + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < sol_size; i++) surface_area += fabs(v_array[i]); CeedVectorRestoreArrayRead(v, &v_array); @@ -260,6 +288,7 @@ int main(int argc, const char *argv[]) { // LCOV_EXCL_STOP } else { CeedScalar tol = (dim == 1 ? 10000. 
* CEED_EPSILON : dim == 2 ? 1E-1 : 1E-1); + if (fabs(surface_area - exact_surface_area) > tol) printf("Surface area error : % .14g\n", surface_area - exact_surface_area); } @@ -287,13 +316,16 @@ int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt // prob_size ~ num_elem * degree^dim CeedInt num_elem = prob_size / CeedIntPow(degree, dim); CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem + while (num_elem > 1) { num_elem /= 2; s++; } CeedInt r = s % dim; + for (CeedInt d = 0; d < dim; d++) { CeedInt sd = s / dim; + if (r > 0) { sd++; r--; @@ -309,6 +341,7 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element CeedInt nd[3], num_elem = 1, scalar_size = 1; + for (CeedInt d = 0; d < dim; d++) { num_elem *= num_xyz[d]; nd[d] = num_xyz[d] * (p - 1) + 1; @@ -319,15 +352,19 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn // |---*-...-*---|---*-...-*---|- ... 
-|--...--| // num_nodes: 0 1 p-1 p p+1 2*p n*p CeedInt *el_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes); + for (CeedInt e = 0; e < num_elem; e++) { CeedInt e_xyz[3] = {1, 1, 1}, re = e; + for (CeedInt d = 0; d < dim; d++) { e_xyz[d] = re % num_xyz[d]; re /= num_xyz[d]; } CeedInt *local_elem_nodes = el_nodes + e * num_nodes; + for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) { CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes; + for (CeedInt d = 0; d < dim; d++) { g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride; g_nodes_stride *= nd[d]; @@ -336,9 +373,10 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn local_elem_nodes[l_nodes] = g_nodes; } } - if (restriction) + if (restriction) { CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, el_nodes, restriction); + } free(el_nodes); if (q_data_restriction) { @@ -351,20 +389,25 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, CeedVector mesh_coords) { CeedInt p = mesh_degree + 1; CeedInt nd[3], scalar_size = 1; + for (CeedInt d = 0; d < dim; d++) { nd[d] = num_xyz[d] * (p - 1) + 1; scalar_size *= nd[d]; } CeedScalar *coords; + CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords); CeedScalar *nodes = malloc(sizeof(CeedScalar) * p); + // The H1 basis uses Lobatto quadrature points as nodes. 
CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i]; for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) { CeedInt r_nodes = gs_nodes; + for (CeedInt d = 0; d < dim; d++) { - CeedInt d1d = r_nodes % nd[d]; + CeedInt d1d = r_nodes % nd[d]; + coords[gs_nodes + scalar_size * d] = ((d1d / (p - 1)) + nodes[d1d % (p - 1)]) / num_xyz[d]; r_nodes /= nd[d]; } @@ -388,6 +431,5 @@ CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_c coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5)); } CeedVectorRestoreArray(mesh_coords, &coords); - return exact_surface_area; } diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h index 4258a1e944..c8aa53b29b 100644 --- a/examples/ceed/ex2-surface.h +++ b/examples/ceed/ex2-surface.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include /// A structure used to pass additional data to f_build_diff struct BuildContext { @@ -15,110 +15,126 @@ struct BuildContext { /// libCEED Q-function for building quadrature data for a diffusion operator CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { struct BuildContext *build_data = (struct BuildContext *)ctx; - // in[0] is Jacobians with shape [dim, nc=dim, Q] + + // in[0] is Jacobians with shape [dim, dim, Q] // in[1] is quadrature weights, size (Q) - // + const CeedScalar *w = in[1]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. - const CeedScalar *J = in[0], *w = in[1]; - CeedScalar *q_data = out[0]; - switch (build_data->dim + 10 * build_data->space_dim) { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; } // End of Quadrature Point Loop - break; - case 22: + case 11: { + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[0][i] = w[i] / J[0][0][i]; } // End of Quadrature Point Loop + } break; + case 22: { + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // J: 0 2 q_data: 0 2 adj(J): J22 -J12 - // 1 3 2 1 -J21 J11 - const CeedScalar J11 = J[i + Q * 0]; - const CeedScalar J21 = J[i + Q * 1]; - const CeedScalar J12 = J[i + Q * 2]; - const CeedScalar J22 = J[i + Q * 3]; - const CeedScalar qw = w[i] / (J11 * J22 - J21 * J12); - q_data[i + Q * 0] = qw * (J12 * J12 + J22 * J22); - q_data[i + Q * 1] = qw * (J11 * J11 + J21 * J21); - q_data[i + Q * 2] = -qw * (J11 * J12 + J21 * J22); + // J: 0 2 q_data: 0 2 adj(J): J11 -J01 + // 1 3 2 1 -J10 
J00 + const CeedScalar J00 = J[0][0][i]; + const CeedScalar J10 = J[0][1][i]; + const CeedScalar J01 = J[1][0][i]; + const CeedScalar J11 = J[1][1][i]; + const CeedScalar qw = w[i] / (J00 * J11 - J10 * J01); + + q_data[0][i] = qw * (J01 * J01 + J11 * J11); + q_data[1][i] = qw * (J00 * J00 + J10 * J10); + q_data[2][i] = -qw * (J00 * J01 + J10 * J11); } // End of Quadrature Point Loop - break; - case 33: + } break; + case 33: { + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { // Compute the adjoint CeedScalar A[3][3]; - for (CeedInt j = 0; j < 3; j++) - for (CeedInt k = 0; k < 3; k++) + + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 3; k++) { // Equivalent code with J as a VLA and no mod operations: // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1] - A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] - - J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))]; + A[k][j] = + J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i]; + } + } // Compute quadrature weight / det(J) - const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]); + const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]); // Compute geometric factors // Stored in Voigt convention // 0 5 4 // 5 1 3 // 4 3 2 - q_data[i + Q * 0] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]); - q_data[i + Q * 1] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]); - q_data[i + Q * 2] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]); - q_data[i + Q * 3] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]); - q_data[i + Q * 4] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); - 
q_data[i + Q * 5] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); + q_data[0][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]); + q_data[1][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]); + q_data[2][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]); + q_data[3][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]); + q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); + q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); } // End of Quadrature Point Loop - break; + } break; } - return 0; + return CEED_ERROR_SUCCESS; } /// libCEED Q-function for applying a diff operator CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { struct BuildContext *build_data = (struct BuildContext *)ctx; - // in[0], out[0] have shape [dim, nc=1, Q] - const CeedScalar *ug = in[0], *q_data = in[1]; - CeedScalar *vg = out[0]; + + // in[0], out[0] are solution gradients with shape [dim, 1, Q] + // in[1] is quadrature data with shape [num_components, Q] + const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; switch (build_data->dim) { - case 1: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[i]; } // End of Quadrature Point Loop - break; - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[2] = {ug[i + Q * 0], ug[i + Q * 1]}; + case 1: { + const CeedScalar *ug = in[0]; + CeedScalar *vg = out[0]; + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[0][i]; } // End of Quadrature Point Loop + } break; + case 2: { + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { // Read q_data (dXdxdXdx_T symmetric matrix)
// Stored in Voigt convention // 0 2 // 2 1 const CeedScalar dXdxdXdx_T[2][2] = { - {q_data[i + 0 * Q], q_data[i + 2 * Q]}, - {q_data[i + 2 * Q], q_data[i + 1 * Q]} + {q_data[0][i], q_data[2][i]}, + {q_data[2][i], q_data[1][i]} }; + // j = direction of vg - for (int j = 0; j < 2; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j]); + for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]); } // End of Quadrature Point Loop - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]}; + } break; + case 3: { + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { // Read q_data (dXdxdXdx_T symmetric matrix) // Stored in Voigt convention // 0 5 4 // 5 1 3 // 4 3 2 const CeedScalar dXdxdXdx_T[3][3] = { - {q_data[i + 0 * Q], q_data[i + 5 * Q], q_data[i + 4 * Q]}, - {q_data[i + 5 * Q], q_data[i + 1 * Q], q_data[i + 3 * Q]}, - {q_data[i + 4 * Q], q_data[i + 3 * Q], q_data[i + 2 * Q]} + {q_data[0][i], q_data[5][i], q_data[4][i]}, + {q_data[5][i], q_data[1][i], q_data[3][i]}, + {q_data[4][i], q_data[3][i], q_data[2][i]} }; + // j = direction of vg - for (int j = 0; j < 3; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]); + for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]); } // End of Quadrature Point Loop - break; + } break; } - return 0; + return CEED_ERROR_SUCCESS; } diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c new file mode 100644 index 0000000000..380882a631 --- /dev/null +++ b/examples/ceed/ex3-volume.c @@ -0,0 +1,418 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and 
other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// libCEED Example 3 +// +// This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator. +// This example also uses a diffusion operator, which provides zero contribution to the computed volume but demonstrates libCEED's ability +// to handle multiple basis evaluation modes for the same input and output vectors. +// Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code. +// +// The example has no dependencies, and is designed to be self-contained. +// For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples. +// +// All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+// +// Build with: +// +// make ex3-volume [CEED_DIR=/path/to/libceed] +// +// Sample runs: +// +// ./ex3-volume +// ./ex3-volume -ceed /cpu/self +// ./ex3-volume -ceed /gpu/cuda +// +// Test in 1D-3D +//TESTARGS(name="1D User QFunction") -ceed {ceed_resource} -d 1 -t +//TESTARGS(name="2D User QFunction") -ceed {ceed_resource} -d 2 -t +//TESTARGS(name="3D User QFunction") -ceed {ceed_resource} -d 3 -t + +/// @file +/// libCEED example using mass and diffusion operators to compute volume + +#include "ex3-volume.h" + +#include +#include +#include +#include +#include + +// Auxiliary functions +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]); +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction); +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords); +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords); + +// Main example +int main(int argc, const char *argv[]) { + const char *ceed_spec = "/cpu/self"; + CeedInt dim = 3; // dimension of the mesh + CeedInt num_comp_x = 3; // number of x components + CeedInt mesh_degree = 4; // polynomial degree for the mesh + CeedInt sol_degree = 4; // polynomial degree for the solution + CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points + CeedInt prob_size = -1; // approximate problem size + CeedInt help = 0, test = 0, benchmark = 0; + + // Process command line arguments. + for (int ia = 1; ia < argc; ia++) { + // LCOV_EXCL_START + int next_arg = ((ia + 1) < argc), parse_error = 0; + if (!strcmp(argv[ia], "-h")) { + help = 1; + } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) { + parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1; + } else if (!strcmp(argv[ia], "-d")) { + parse_error = next_arg ?
dim = atoi(argv[++ia]), 0 : 1; + num_comp_x = dim; + } else if (!strcmp(argv[ia], "-m")) { + parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-p")) { + parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-q")) { + parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-s")) { + parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-b")) { + parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-t")) { + test = 1; + } + if (parse_error) { + printf("Error parsing command line options.\n"); + return 1; + } + // LCOV_EXCL_STOP + } + if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024; + + // Print the values of all options: + if (!test || help) { + // LCOV_EXCL_START + printf("Selected options: [command line option] : \n"); + printf(" Ceed specification [-c] : %s\n", ceed_spec); + printf(" Mesh dimension [-d] : %" CeedInt_FMT "\n", dim); + printf(" Mesh degree [-m] : %" CeedInt_FMT "\n", mesh_degree); + printf(" Solution degree [-p] : %" CeedInt_FMT "\n", sol_degree); + printf(" Num. 1D quadrature pts [-q] : %" CeedInt_FMT "\n", num_qpts); + printf(" Approx. # unknowns [-s] : %" CeedInt_FMT "\n", prob_size); + printf(" QFunction source : header"); + if (help) { + printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)")); + return 0; + } + printf("\n"); + // LCOV_EXCL_STOP + } + + // Select appropriate backend and logical device based on the (-ceed) command line argument. + Ceed ceed; + + CeedInit(ceed_spec, &ceed); + + // Construct the mesh and solution bases. 
+ CeedBasis mesh_basis, sol_basis; + + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis); + + // Determine the mesh size based on the given approximate problem size. + CeedInt num_xyz[dim]; + + GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz); + if (!test) { + // LCOV_EXCL_START + printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]); + if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]); + if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]); + printf("\n"); + // LCOV_EXCL_STOP + } + + // Build CeedElemRestriction objects describing the mesh and solution discrete representations. + CeedInt mesh_size, sol_size; + CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction; + + BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL); + BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1 + dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restriction); + BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, NULL); + if (!test) { + // LCOV_EXCL_START + printf("Number of mesh nodes : %" CeedInt_FMT "\n", mesh_size / dim); + printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size); + // LCOV_EXCL_STOP + } + + // Create a CeedVector with the mesh coordinates. + CeedVector mesh_coords; + + CeedVectorCreate(ceed, mesh_size, &mesh_coords); + SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords); + + // Apply a transformation to the mesh. + CeedScalar exact_volume = TransformMeshCoords(dim, mesh_size, mesh_coords); + + // Context data to be passed to the 'build_mass_diff' QFunction. 
+ CeedQFunctionContext build_ctx; + struct BuildContext build_ctx_data; + + build_ctx_data.dim = build_ctx_data.space_dim = dim; + CeedQFunctionContextCreate(ceed, &build_ctx); + CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); + + // Create the QFunction that builds the mass + diffusion operator (i.e. computes its quadrature data) and set its context data. + CeedQFunction qf_build; + + CeedQFunctionCreateInterior(ceed, 1, build_mass_diff, build_mass_diff_loc, &qf_build); + CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_build, "qdata", 1 + dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionSetContext(qf_build, build_ctx); + + // Create the operator that builds the quadrature data for the mass + diffusion operator. + CeedOperator op_build; + + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); + CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); + CeedOperatorSetField(op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + + // Compute the quadrature data for the mass + diffusion operator. + CeedVector q_data; + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); + CeedInt num_elem = 1; + + for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d]; + CeedVectorCreate(ceed, num_elem * elem_qpts * (1 + dim * (dim + 1) / 2), &q_data); + CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE); + + // Create the QFunction that defines the action of the mass + diffusion operator. 
+ CeedQFunction qf_apply; + + CeedQFunctionCreateInterior(ceed, 1, apply_mass_diff, apply_mass_diff_loc, &qf_apply); + CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_apply, "du", dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_apply, "qdata", 1 + dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_apply, "dv", dim, CEED_EVAL_GRAD); + CeedQFunctionSetContext(qf_apply, build_ctx); + + // Create the mass + diffusion operator. + CeedOperator op_apply; + + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); + CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "du", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "dv", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + + // Create auxiliary solution-size vectors. + CeedVector u, v; + + CeedVectorCreate(ceed, sol_size, &u); + CeedVectorCreate(ceed, sol_size, &v); + + // Initialize 'u' with ones. + CeedVectorSetValue(u, 1.0); + + // Compute the mesh volume using the mass + diffusion operator: volume = 1^T \cdot M \cdot 1 + CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + + // Benchmark runs + if (!test && benchmark) { + // LCOV_EXCL_START + printf(" Executing %d benchmarking runs...\n", benchmark); + // LCOV_EXCL_STOP + } + for (CeedInt i = 0; i < benchmark; i++) { + // LCOV_EXCL_START + CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + // LCOV_EXCL_STOP + } + + // Compute and print the sum of the entries of 'v' giving the mesh volume. 
+ CeedScalar volume = 0.; + + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + if (!test) { + // LCOV_EXCL_START + printf(" done.\n"); + printf("Exact mesh volume : % .14g\n", exact_volume); + printf("Computed mesh volume : % .14g\n", volume); + printf("Volume error : % .14g\n", volume - exact_volume); + // LCOV_EXCL_STOP + } else { + CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5); + + if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume); + } + + // Free dynamically allocated memory. + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&mesh_coords); + CeedOperatorDestroy(&op_apply); + CeedQFunctionDestroy(&qf_apply); + CeedQFunctionContextDestroy(&build_ctx); + CeedOperatorDestroy(&op_build); + CeedQFunctionDestroy(&qf_build); + CeedElemRestrictionDestroy(&sol_restriction); + CeedElemRestrictionDestroy(&mesh_restriction); + CeedElemRestrictionDestroy(&q_data_restriction); + CeedBasisDestroy(&sol_basis); + CeedBasisDestroy(&mesh_basis); + CeedDestroy(&ceed); + return 0; +} + +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) { + // Use the approximate formula: + // prob_size ~ num_elem * degree^dim + CeedInt num_elem = prob_size / CeedIntPow(degree, dim); + CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem + + while (num_elem > 1) { + num_elem /= 2; + s++; + } + CeedInt r = s % dim; + + for (CeedInt d = 0; d < dim; d++) { + CeedInt sd = s / dim; + + if (r > 0) { + sd++; + r--; + } + num_xyz[d] = 1 << sd; + } + return 0; +} + +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction) { + CeedInt 
p = degree + 1; + CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element + CeedInt nd[3], num_elem = 1, scalar_size = 1; + + for (CeedInt d = 0; d < dim; d++) { + num_elem *= num_xyz[d]; + nd[d] = num_xyz[d] * (p - 1) + 1; + scalar_size *= nd[d]; + } + *size = scalar_size * num_comp; + // elem: 0 1 n-1 + // |---*-...-*---|---*-...-*---|- ... -|--...--| + // num_nodes: 0 1 p-1 p p+1 2*p n*p + CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes); + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt e_xyz[3] = {1, 1, 1}, re = e; + + for (CeedInt d = 0; d < dim; d++) { + e_xyz[d] = re % num_xyz[d]; + re /= num_xyz[d]; + } + CeedInt *local_elem_nodes = elem_nodes + e * num_nodes; + + for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) { + CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes; + + for (CeedInt d = 0; d < dim; d++) { + g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride; + g_nodes_stride *= nd[d]; + r_nodes /= p; + } + local_elem_nodes[l_nodes] = g_nodes; + } + } + if (restriction) { + CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes, + restriction); + } + if (q_data_restriction) { + CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction); + } + free(elem_nodes); + return 0; +} + +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) { + CeedInt p = mesh_degree + 1; + CeedInt nd[3], scalar_size = 1; + + for (CeedInt d = 0; d < dim; d++) { + nd[d] = num_xyz[d] * (p - 1) + 1; + scalar_size *= nd[d]; + } + CeedScalar *coords; + + CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords); + CeedScalar *nodes = malloc(sizeof(CeedScalar) * p); + + // The H1 basis uses Lobatto quadrature 
points as nodes. + CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] + for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i]; + for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) { + CeedInt r_nodes = gs_nodes; + + for (CeedInt d = 0; d < dim; d++) { + CeedInt d_1d = r_nodes % nd[d]; + coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d]; + r_nodes /= nd[d]; + } + } + free(nodes); + CeedVectorRestoreArray(mesh_coords, &coords); + return 0; +} + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#define M_PI_2 1.57079632679489661923 +#endif + +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) { + CeedScalar exact_volume; + CeedScalar *coords; + + CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords); + if (dim == 1) { + for (CeedInt i = 0; i < mesh_size; i++) { + // map [0,1] to [0,1] varying the mesh density + coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5)); + } + exact_volume = 1.; + } else { + CeedInt num_nodes = mesh_size / dim; + for (CeedInt i = 0; i < num_nodes; i++) { + // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar + // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi + CeedScalar u = coords[i], v = coords[i + num_nodes]; + + u = 1. + u; + v = M_PI_2 * v; + coords[i] = u * cos(v); + coords[i + num_nodes] = u * sin(v); + } + exact_volume = 3. / 4. * M_PI; + } + CeedVectorRestoreArray(mesh_coords, &coords); + return exact_volume; +} diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h new file mode 100644 index 0000000000..0d2c0419e4 --- /dev/null +++ b/examples/ceed/ex3-volume.h @@ -0,0 +1,172 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +/// A structure used to pass additional data to f_build_mass_diff +struct BuildContext { + CeedInt dim, space_dim; +}; + +/// libCEED Q-function for building quadrature data for a mass + diffusion operator +CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + struct BuildContext *build_data = (struct BuildContext *)ctx; + + // in[0] is Jacobians with shape [dim, dim, Q] + // in[1] is quadrature weights, size (Q) + const CeedScalar *w = in[1]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + + // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store + // the symmetric part of the result. + switch (build_data->dim + 10 * build_data->space_dim) { + case 11: { + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + q_data[0][i] = w[i] * J[0][0][i]; + + // Diffusion + q_data[1][i] = w[i] / J[0][0][i]; + } // End of Quadrature Point Loop + } break; + case 22: { + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // J: 0 2 q_data: 0 2 adj(J): J22 -J12 + // 1 3 2 1 -J10 J00 + const CeedScalar J00 = J[0][0][i]; + const CeedScalar J10 = J[0][1][i]; + const CeedScalar J01 = J[1][0][i]; + const CeedScalar J11 = J[1][1][i]; + const CeedScalar qw = w[i] / (J00 * J11 - J10 * J01); + + // Mass + q_data[0][i] = w[i] * (J00 * J11 - J10 * J01); + + // Diffusion + q_data[1][i] = qw * (J01 * J01 + J11 * J11); + q_data[2][i] = qw * (J00 * J00 + J10 * J10); + q_data[3][i] = -qw * (J00 * J01 + J10 * J11); + } // End of Quadrature Point Loop + } break; + case 33: { + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) 
{ + // Compute the adjoint + CeedScalar A[3][3]; + + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 3; k++) { + // Equivalent code with J as a VLA and no mod operations: + // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1] + A[k][j] = + J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i]; + } + } + + // Compute quadrature weight / det(J) + const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]); + + // Mass + q_data[0][i] = w[i] * (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]); + + // Diffusion + // Stored in Voigt convention + // 1 6 5 + // 6 2 4 + // 5 4 3 + q_data[1][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]); + q_data[2][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]); + q_data[3][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]); + q_data[4][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]); + q_data[5][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); + q_data[6][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); + } // End of Quadrature Point Loop + } break; + } + return CEED_ERROR_SUCCESS; +} + +/// libCEED Q-function for applying a mass + diffusion operator +CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + struct BuildContext *build_data = (struct BuildContext *)ctx; + + // in[0], out[0] solution values with shape [1, 1, Q] + // in[1], out[1] solution gradients with shape [dim, 1, Q] + // in[2] is quadrature data with shape [num_components, Q] + const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + + switch (build_data->dim) { + case 1: { + const CeedScalar *u = in[0], *ug = in[1]; + CeedScalar *v = out[0], *vg = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) 
{ + // Mass + v[i] = q_data[0][i] * u[i]; + + // Diffusion + vg[i] = q_data[1][i] * ug[i]; + } // End of Quadrature Point Loop + } break; + case 2: { + const CeedScalar *u = in[0]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + CeedScalar *v = out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + v[i] = q_data[0][i] * u[i]; + + // Diffusion + // Read q_data (dXdxdXdx_T symmetric matrix) + // Stored in Voigt convention + // 1 3 + // 3 2 + const CeedScalar dXdxdXdx_T[2][2] = { + {q_data[1][i], q_data[3][i]}, + {q_data[3][i], q_data[2][i]} + }; + + // j = direction of vg + for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]); + } // End of Quadrature Point Loop + } break; + case 3: { + const CeedScalar *u = in[0]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + CeedScalar *v = out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + v[i] = q_data[0][i] * u[i]; + + // Diffusion + // Read q_data (dXdxdXdx_T symmetric matrix) + // Stored in Voigt convention + // 1 6 5 + // 6 2 4 + // 5 4 3 + const CeedScalar dXdxdXdx_T[3][3] = { + {q_data[1][i], q_data[6][i], q_data[5][i]}, + {q_data[6][i], q_data[2][i], q_data[4][i]}, + {q_data[5][i], q_data[4][i], q_data[3][i]} + }; + + // j = direction of vg + for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]); + } // End of Quadrature Point Loop + } break; + } + return CEED_ERROR_SUCCESS; +} diff --git a/examples/ceed/index.md b/examples/ceed/index.md index 5d2d7a3807..02b0f41749 100644 --- a/examples/ceed/index.md +++ b/examples/ceed/index.md @@ -1,45 +1,39 @@ # Standalone libCEED -The following two examples have no dependencies, and are designed to be self-contained. 
-For additional examples that use external discretization libraries (MFEM, PETSc, Nek5000 -etc.) see the subdirectories in {file}`examples/`. +The following three examples have no dependencies, and are designed to be self-contained. +For additional examples that use external discretization libraries (MFEM, PETSc, Nek5000 etc.) see the subdirectories in {file}`examples/`. (ex1-volume)= ## Ex1-Volume -This example is located in the subdirectory {file}`examples/ceed`. It illustrates a -simple usage of libCEED to compute the volume of a given body using a matrix-free -application of the mass operator. Arbitrary mesh and solution orders in 1D, 2D, and 3D -are supported from the same code. +This example is located in the subdirectory {file}`examples/ceed`. +It illustrates a simple usage of libCEED to compute the volume of a given body using a matrix-free application of the mass operator. +Arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code. -This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D -domain $\Omega$ respectively, by applying the mass operator to a vector of -$1$s. It computes: +This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D domain $\Omega$ respectively, by applying the mass operator to a vector of $1$s. +It computes: $$ I = \int_{\Omega} 1 \, dV . $$ (eq-ex1-volume) -Using the same notation as in {ref}`theoretical-framework`, we write here the vector -$u(x)\equiv 1$ in the Galerkin approximation, -and find the volume of $\Omega$ as +Using the same notation as in {ref}`theoretical-framework`, we write here the vector $u(x)\equiv 1$ in the Galerkin approximation, and find the volume of $\Omega$ as $$ \sum_e \int_{\Omega_e} v(x) 1 \, dV $$ (volume-sum) -with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, -the test functions. 
+with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions. (ex2-surface)= ## Ex2-Surface -This example is located in the subdirectory {file}`examples/ceed`. It computes the -surface area of a given body using matrix-free application of a diffusion operator. -Similar to {ref}`Ex1-Volume`, arbitrary mesh and solution orders in 1D, 2D, and 3D -are supported from the same code. It computes: +This example is located in the subdirectory {file}`examples/ceed`. +It computes the surface area of a given body using matrix-free application of a diffusion operator. +Similar to {ref}`Ex1-Volume`, arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code. +It computes: $$ I = \int_{\partial \Omega} 1 \, dS , @@ -65,3 +59,29 @@ Since we have chosen $u$ such that $\nabla u \cdot \hat{\bm n} = 1$, the boundar $$ \int_\Omega \nabla v \cdot \nabla u \, dV \approx \sum_e \int_{\partial \Omega_e} v(x) 1 \, dS . $$ + +(ex3-volume)= + +## Ex3-Volume + +This example is located in the subdirectory {file}`examples/ceed`. +It illustrates a more complex usage of libCEED to compute the volume of a given body using a matrix-free application of the screened Poisson operator. +Arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code. + +This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D domain $\Omega$ respectively, by applying the screened Poisson operator to a vector of $1$s. +It computes: + +$$ +I = \int_{\Omega} \left( 1 + \nabla^2 1 \right) \, dV . 
+$$ (eq-ex3-volume) + +Using the same notation as in {ref}`theoretical-framework`, we write here the vector $u(x)\equiv 1$ in the Galerkin approximation, and find the volume of $\Omega$ as + +$$ +\sum_e \int_{\Omega_e}\left( v(x) 1 + \nabla v(x) \cdot 0 \right) \, dV +$$ (volume-sum-mass-diff) + +with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions. + +The addition of the Poisson term is not needed to compute the volume of the region, as shown in example 1. +Rather, this example illustrates the ability to add multiple evaluation modes for the same input or output vector in a libCEED operator. diff --git a/examples/deal.II/CMakeLists.txt b/examples/deal.II/CMakeLists.txt index 272facfc00..d5de2d7ddb 100644 --- a/examples/deal.II/CMakeLists.txt +++ b/examples/deal.II/CMakeLists.txt @@ -1,4 +1,4 @@ -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.8) +CMAKE_MINIMUM_REQUIRED(VERSION 3.10.0) FIND_PACKAGE(deal.II 8.0 QUIET HINTS ${deal.II_DIR} ${DEAL_II_DIR} ../ ../../ $ENV{DEAL_II_DIR} @@ -11,13 +11,21 @@ IF(NOT ${deal.II_FOUND}) ) ENDIF() -DEAL_II_INITIALIZE_CACHED_VARIABLES() -PROJECT("bps") +FILE(GLOB SOURCE_FILES "*.cc") -DEAL_II_INITIALIZE_CACHED_VARIABLES() +FOREACH ( source_file ${SOURCE_FILES} ) + GET_FILENAME_COMPONENT(file_name ${source_file} NAME) + STRING( REPLACE ".cc" "" exec ${file_name} ) -ADD_EXECUTABLE(bps bps.cc) -DEAL_II_SETUP_TARGET(bps) + DEAL_II_INITIALIZE_CACHED_VARIABLES() + PROJECT(${exec}) -TARGET_INCLUDE_DIRECTORIES(bps PUBLIC ${CEED_DIR}/include) -TARGET_LINK_LIBRARIES(bps ${CEED_DIR}/lib/libceed.so) + DEAL_II_INITIALIZE_CACHED_VARIABLES() + + ADD_EXECUTABLE(${exec} ${source_file}) + DEAL_II_SETUP_TARGET(${exec}) + + TARGET_INCLUDE_DIRECTORIES(${exec} PUBLIC ${CEED_DIR}/include) + TARGET_LINK_LIBRARIES(${exec} ${CEED_DIR}/lib/libceed.so) + +ENDFOREACH ( source_file ${SOURCE_FILES} ) diff --git a/examples/deal.II/README.MD b/examples/deal.II/README.md similarity index 59% rename from 
examples/deal.II/README.MD rename to examples/deal.II/README.md index cd3f14a3cb..18dba6dd7c 100644 --- a/examples/deal.II/README.MD +++ b/examples/deal.II/README.md @@ -1,6 +1,7 @@ -An example how to write libCEED operators (BP1-BP6) within the open-source -finite element library [deal.II](https://www.dealii.org/). As reference, -operators are presented that use the native matrix-free infrastructure. +## libCEED deal.II Example + +An example of how to write libCEED operators (BP1-BP6) within the open-source finite element library [deal.II](https://www.dealii.org/). +As a reference, operators are presented that use the native matrix-free infrastructure. First compile deal.II and libCEED individually. After that, compile the deal.II example: @@ -11,10 +12,14 @@ cmake ../ -DDEAL_II_DIR=~/path/to/dealii -DCEED_DIR=~/path/to/libceed make ``` -To run the executable, write: +To run the executables, write: + +``` +./bps-cpu +``` ``` -./bps +./bps-kokkos ``` Optional command-line arguments are shown by adding the command-line argument "--help". diff --git a/examples/deal.II/bps-ceed.h b/examples/deal.II/bps-ceed.h new file mode 100644 index 0000000000..f9041d4c6f --- /dev/null +++ b/examples/deal.II/bps-ceed.h @@ -0,0 +1,648 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2023 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. 
+// +// Authors: Peter Munch, Martin Kronbichler +// +// --------------------------------------------------------------------- + +#pragma once +#ifndef bps_ceed_h +# define bps_ceed_h + +// deal.II includes +# include + +# include + +# include + +# include +# include +# include +# include + +// local includes +# include "bps.h" + +// libCEED includes +# include +# include + +// QFunction source +# include "bps-qfunctions.h" + +using namespace dealii; + + +/** + * Operator implementation using libCEED. + */ +template +class OperatorCeed : public OperatorBase +{ +public: + using VectorType = typename OperatorBase::VectorType; + + /** + * Constructor. + */ + OperatorCeed(const Mapping &mapping, + const DoFHandler &dof_handler, + const AffineConstraints &constraints, + const Quadrature &quadrature, + const BPType &bp, + const std::string &resource) + : mapping(mapping) + , dof_handler(dof_handler) + , constraints(constraints) + , quadrature(quadrature) + , bp(bp) + , resource(resource) + { + reinit(); + } + + /** + * Destructor. + */ + ~OperatorCeed() + { + CeedVectorDestroy(&src_ceed); + CeedVectorDestroy(&dst_ceed); + CeedOperatorDestroy(&op_apply); + CeedDestroy(&ceed); + } + + /** + * Initialized internal data structures, particularly, libCEED. 
+ */ + void + reinit() override + { + CeedVector metric_data; + CeedBasis sol_basis; + CeedElemRestriction sol_restriction; + CeedElemRestriction metric_data_restriction; + BuildContext build_ctx_data; + CeedQFunctionContext build_ctx; + CeedQFunction qf_apply; + + const auto &tria = dof_handler.get_triangulation(); + const auto &fe = dof_handler.get_fe(); + + const auto n_components = fe.n_components(); + + if (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5) + { + AssertThrow(n_components == 1, ExcInternalError()); + } + else + { + AssertThrow(n_components == dim, ExcInternalError()); + } + + // 1) create CEED instance -> "MatrixFree" + const char *ceed_spec = resource.c_str(); + CeedInit(ceed_spec, &ceed); + + // 2) create shape functions -> "ShapeInfo" + const unsigned int fe_degree = fe.tensor_degree(); + const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size(); + { + const dealii::internal::MatrixFreeFunctions::ShapeInfo shape_info(quadrature, fe, 0); + const auto &shape_data = shape_info.get_shape_data(); + std::vector q_ref_1d; + for (const auto q : shape_data.quadrature.get_points()) + q_ref_1d.push_back(q(0)); + + // transpose bases for compatibility with restriction + std::vector interp_1d(shape_data.shape_values.size()); + std::vector grad_1d(shape_data.shape_gradients.size()); + for (unsigned int i = 0; i < n_q_points; ++i) + for (unsigned int j = 0; j < fe_degree + 1; ++j) + { + interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i]; + grad_1d[j + i * (fe_degree + 1)] = shape_data.shape_gradients[j * n_q_points + i]; + } + + CeedBasisCreateTensorH1(ceed, + dim, + n_components, + fe_degree + 1, + n_q_points, + interp_1d.data(), + grad_1d.data(), + q_ref_1d.data(), + quadrature.get_tensor_basis()[0].get_weights().data(), + &sol_basis); + } + + // 3) create restriction matrix -> DoFInfo + unsigned int n_local_active_cells = 0; + + for (const auto &cell : dof_handler.active_cell_iterators()) + if 
(cell->is_locally_owned()) + n_local_active_cells++; + + partitioner = + std::make_shared(dof_handler.locally_owned_dofs(), + DoFTools::extract_locally_active_dofs( + dof_handler), + dof_handler.get_communicator()); + + std::vector indices; + indices.reserve(n_local_active_cells * fe.n_dofs_per_cell() / n_components); + + const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering(fe_degree); + + std::vector local_indices(fe.n_dofs_per_cell()); + + for (const auto &cell : dof_handler.active_cell_iterators()) + if (cell->is_locally_owned()) + { + cell->get_dof_indices(local_indices); + + for (const auto i : dof_mapping) + indices.emplace_back( + partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)])); + } + + CeedElemRestrictionCreate(ceed, + n_local_active_cells, + fe.n_dofs_per_cell() / n_components, + n_components, + 1, + this->extended_local_size(), + CEED_MEM_HOST, + CEED_COPY_VALUES, + indices.data(), + &sol_restriction); + + // 4) create mapping -> MappingInfo + const unsigned int n_components_metric = (bp <= BPType::BP2) ? 
1 : (dim * (dim + 1) / 2); + + metric_data_raw = compute_metric_data(ceed, mapping, tria, quadrature, bp); + + strides = {{1, + static_cast(quadrature.size()), + static_cast(quadrature.size() * n_components_metric)}}; + CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data); + CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data()); + CeedElemRestrictionCreateStrided(ceed, + n_local_active_cells, + quadrature.size(), + n_components_metric, + metric_data_raw.size(), + strides.data(), + &metric_data_restriction); + + build_ctx_data.dim = dim; + build_ctx_data.space_dim = dim; + + CeedQFunctionContextCreate(ceed, &build_ctx); + CeedQFunctionContextSetData( + build_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(build_ctx_data), &build_ctx_data); + + // 5) create q operation + if (bp == BPType::BP1) + CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply); + else if (bp == BPType::BP2) + CeedQFunctionCreateInterior(ceed, 1, f_apply_mass_vec, f_apply_mass_vec_loc, &qf_apply); + else if (bp == BPType::BP3 || bp == BPType::BP5) + CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson, f_apply_poisson_loc, &qf_apply); + else if (bp == BPType::BP4 || bp == BPType::BP6) + CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson_vec, f_apply_poisson_vec_loc, &qf_apply); + else + AssertThrow(false, ExcInternalError()); + + if (bp <= BPType::BP2) + CeedQFunctionAddInput(qf_apply, "u", n_components, CEED_EVAL_INTERP); + else + CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD); + + CeedQFunctionAddInput(qf_apply, "metric data", n_components_metric, CEED_EVAL_NONE); + + if (bp <= BPType::BP2) + CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP); + else + CeedQFunctionAddOutput(qf_apply, "v", dim * n_components, CEED_EVAL_GRAD); + + CeedQFunctionSetContext(qf_apply, build_ctx); + + // 6) put everything together + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, 
CEED_QFUNCTION_NONE, &op_apply); + + CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField( + op_apply, "metric data", metric_data_restriction, CEED_BASIS_NONE, metric_data); + CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + + // 7) libCEED vectors + CeedElemRestrictionCreateVector(sol_restriction, &src_ceed, NULL); + CeedElemRestrictionCreateVector(sol_restriction, &dst_ceed, NULL); + + // 8) cleanup + CeedVectorDestroy(&metric_data); + CeedElemRestrictionDestroy(&metric_data_restriction); + CeedElemRestrictionDestroy(&sol_restriction); + CeedBasisDestroy(&sol_basis); + CeedQFunctionContextDestroy(&build_ctx); + CeedQFunctionDestroy(&qf_apply); + } + + /** + * Perform matrix-vector product. + */ + void + vmult(VectorType &dst, const VectorType &src) const override + { + // communicate: update ghost values + src.update_ghost_values(); + + // pass memory buffers to libCEED + VectorTypeCeed x(src_ceed); + VectorTypeCeed y(dst_ceed); + x.import_array(src, CEED_MEM_HOST); + y.import_array(dst, CEED_MEM_HOST); + + // apply operator + CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE); + + // pull arrays back to deal.II + x.take_array(); + y.take_array(); + + // communicate: compress + src.zero_out_ghost_values(); + dst.compress(VectorOperation::add); + + // apply constraints: we assume homogeneous DBC + constraints.set_zero(dst); + } + + /** + * Initialized vector. + */ + void + initialize_dof_vector(VectorType &vec) const override + { + vec.reinit(partitioner); + } + + /** + * Compute inverse of diagonal. 
+ */ + void + compute_inverse_diagonal(VectorType &diagonal) const override + { + this->initialize_dof_vector(diagonal); + + // pass memory buffer to libCEED + VectorTypeCeed y(dst_ceed); + y.import_array(diagonal, CEED_MEM_HOST); + + CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE); + + // pull array back to deal.II + y.take_array(); + + diagonal.compress(VectorOperation::add); + + // apply constraints: we assume homogeneous DBC + constraints.set_zero(diagonal); + + for (auto &i : diagonal) + i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0; + } + +private: + /** + * Wrapper around a deal.II vector to create a libCEED vector view. + */ + class VectorTypeCeed + { + public: + /** + * Constructor. + */ + VectorTypeCeed(const CeedVector &vec_orig) + { + vec_ceed = NULL; + CeedVectorReferenceCopy(vec_orig, &vec_ceed); + } + + /** + * Return libCEED vector view. + */ + CeedVector & + operator()() + { + return vec_ceed; + } + + /** + * Set deal.II memory in libCEED vector. + */ + void + import_array(const VectorType &vec, const CeedMemType space) + { + mem_space = space; + CeedVectorSetArray(vec_ceed, mem_space, CEED_USE_POINTER, vec.get_values()); + } + + /** + * Sync memory from device to host. + */ + void + sync_array() + { + CeedVectorSyncArray(vec_ceed, mem_space); + } + + /** + * Take previously set deal.II array from libCEED vector + */ + void + take_array() + { + CeedScalar *ptr; + CeedVectorTakeArray(vec_ceed, mem_space, &ptr); + } + + /** + * Destructor: destroy vector view. + */ + ~VectorTypeCeed() + { + bool has_array; + CeedVectorHasBorrowedArrayOfType(vec_ceed, mem_space, &has_array); + if (has_array) + { + CeedScalar *ptr; + CeedVectorTakeArray(vec_ceed, mem_space, &ptr); + } + CeedVectorDestroy(&vec_ceed); + } + + private: + /** + * libCEED vector view. + */ + CeedMemType mem_space; + CeedVector vec_ceed; + }; + + /** + * Number of locally active DoFs. 
+ */ + unsigned int + extended_local_size() const + { + return partitioner->locally_owned_size() + partitioner->n_ghost_indices(); + } + + /** + * Compute metric data: Jacobian, ... + */ + static std::vector + compute_metric_data(const Ceed &ceed, + const Mapping &mapping, + const Triangulation &tria, + const Quadrature &quadrature, + const BPType bp) + { + std::vector metric_data_raw; + + CeedBasis geo_basis; + CeedVector metric_data; + CeedElemRestriction metric_data_restriction; + CeedVector node_coords; + CeedElemRestriction geo_restriction; + CeedQFunctionContext build_ctx; + CeedQFunction qf_build; + CeedOperator op_build; + + const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size(); + + const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2); + + const auto mapping_q = dynamic_cast *>(&mapping); + + AssertThrow(mapping_q, ExcMessage("Wrong mapping!")); + + const unsigned int fe_degree = mapping_q->get_degree(); + + FE_Q geo_fe(fe_degree); + + { + const dealii::internal::MatrixFreeFunctions::ShapeInfo shape_info(quadrature, + geo_fe, + 0); + const auto &shape_data = shape_info.get_shape_data(); + std::vector q_ref_1d; + for (const auto q : shape_data.quadrature.get_points()) + q_ref_1d.push_back(q(0)); + + // transpose bases for compatibility with restriction + std::vector interp_1d(shape_data.shape_values.size()); + std::vector grad_1d(shape_data.shape_gradients.size()); + for (unsigned int i = 0; i < n_q_points; ++i) + for (unsigned int j = 0; j < fe_degree + 1; ++j) + { + interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i]; + grad_1d[j + i * (fe_degree + 1)] = shape_data.shape_gradients[j * n_q_points + i]; + } + + CeedBasisCreateTensorH1(ceed, + dim, + dim, + fe_degree + 1, + n_q_points, + interp_1d.data(), + grad_1d.data(), + q_ref_1d.data(), + quadrature.get_tensor_basis()[0].get_weights().data(), + &geo_basis); + } + + unsigned int n_local_active_cells = 0; + + for (const auto 
&cell : tria.active_cell_iterators()) + if (cell->is_locally_owned()) + n_local_active_cells++; + + std::vector geo_support_points; + std::vector geo_indices; + + DoFHandler geo_dof_handler(tria); + geo_dof_handler.distribute_dofs(geo_fe); + + const auto geo_partitioner = + std::make_shared(geo_dof_handler.locally_owned_dofs(), + DoFTools::extract_locally_active_dofs( + geo_dof_handler), + geo_dof_handler.get_communicator()); + + geo_indices.reserve(n_local_active_cells * geo_fe.n_dofs_per_cell()); + + const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering(fe_degree); + + FEValues fe_values(mapping, + geo_fe, + geo_fe.get_unit_support_points(), + update_quadrature_points); + + std::vector local_indices(geo_fe.n_dofs_per_cell()); + + const unsigned int n_points = + geo_partitioner->locally_owned_size() + geo_partitioner->n_ghost_indices(); + + geo_support_points.resize(dim * n_points); + + for (const auto &cell : geo_dof_handler.active_cell_iterators()) + if (cell->is_locally_owned()) + { + fe_values.reinit(cell); + cell->get_dof_indices(local_indices); + + for (const auto i : dof_mapping) + { + const auto index = geo_partitioner->global_to_local(local_indices[i]); + geo_indices.emplace_back(index * dim); + + const auto point = fe_values.quadrature_point(i); + + for (unsigned int d = 0; d < dim; ++d) + geo_support_points[index * dim + d] = point[d]; + } + } + + metric_data_raw.resize(n_local_active_cells * quadrature.size() * n_components_metric); + + CeedInt strides[3] = {1, + static_cast(quadrature.size()), + static_cast(quadrature.size() * n_components_metric)}; + + CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data); + CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data()); + CeedElemRestrictionCreateStrided(ceed, + n_local_active_cells, + quadrature.size(), + n_components_metric, + metric_data_raw.size(), + strides, + &metric_data_restriction); + + CeedVectorCreate(ceed, geo_support_points.size(), 
&node_coords); + CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data()); + + CeedElemRestrictionCreate(ceed, + n_local_active_cells, + geo_fe.n_dofs_per_cell(), + dim, + 1, + geo_support_points.size(), + CEED_MEM_HOST, + CEED_COPY_VALUES, + geo_indices.data(), + &geo_restriction); + + BuildContext build_ctx_data; + build_ctx_data.dim = dim; + build_ctx_data.space_dim = dim; + + CeedQFunctionContextCreate(ceed, &build_ctx); + CeedQFunctionContextSetData( + build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); + + // 5) create q operation + if (bp <= BPType::BP2) + CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build); + else + CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build); + + CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_build, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_build, "metric data", n_components_metric, CEED_EVAL_NONE); + CeedQFunctionSetContext(qf_build, build_ctx); + + // 6) put everything together + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); + CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField( + op_build, "weight", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE); + CeedOperatorSetField( + op_build, "metric data", metric_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + + CeedOperatorApply(op_build, node_coords, metric_data, CEED_REQUEST_IMMEDIATE); + + CeedVectorDestroy(&node_coords); + CeedVectorSyncArray(metric_data, CEED_MEM_HOST); + CeedVectorDestroy(&metric_data); + CeedElemRestrictionDestroy(&geo_restriction); + CeedElemRestrictionDestroy(&metric_data_restriction); + CeedBasisDestroy(&geo_basis); + CeedQFunctionContextDestroy(&build_ctx); + CeedQFunctionDestroy(&qf_build); + CeedOperatorDestroy(&op_build); + + return 
metric_data_raw; + } + + /** + * Mapping object passed to the constructor. + */ + const Mapping &mapping; + + /** + * DoFHandler object passed to the constructor. + */ + const DoFHandler &dof_handler; + + /** + * Constraints object passed to the constructor. + */ + const AffineConstraints &constraints; + + /** + * Quadrature rule object passed to the constructor. + */ + const Quadrature &quadrature; + + /** + * Selected BP. + */ + const BPType bp; + + /** + * Resource name. + */ + const std::string resource; + + /** + * Partitioner for distributed vectors. + */ + std::shared_ptr partitioner; + + /** + * libCEED data structures. + */ + Ceed ceed; + std::vector metric_data_raw; + std::array strides; + CeedVector src_ceed; + CeedVector dst_ceed; + CeedOperator op_apply; +}; + +#endif diff --git a/examples/deal.II/bps.cc b/examples/deal.II/bps-cpu.cc similarity index 97% rename from examples/deal.II/bps.cc rename to examples/deal.II/bps-cpu.cc index 9d72710d65..2355078ccf 100644 --- a/examples/deal.II/bps.cc +++ b/examples/deal.II/bps-cpu.cc @@ -46,7 +46,8 @@ #include // include operators -#include "bps.h" +#include "bps-ceed.h" +#include "bps-cpu.h" // Test cases //TESTARGS(name="BP1") --resource {ceed_resource} --bp BP1 --fe_degree 2 --print_timings 0 @@ -61,7 +62,7 @@ struct Parameters unsigned int n_global_refinements = 1; unsigned int fe_degree = 2; bool print_timings = true; - std::string libCEED_resource = "/cpu/self/avx/blocked"; + std::string libCEED_resource = "/cpu/self"; bool parse(int argc, char *argv[]) @@ -167,7 +168,7 @@ main(int argc, char *argv[]) #ifdef DEAL_II_WITH_P4EST parallel::distributed::Triangulation tria(MPI_COMM_WORLD); #else - parallel::shared::Triangulation tria(MPI_COMM_WORLD, ::Triangulation::none, true); + Triangulation tria; #endif GridGenerator::hyper_cube(tria); @@ -176,6 +177,8 @@ main(int argc, char *argv[]) DoFHandler dof_handler(tria); dof_handler.distribute_dofs(fe); + DoFRenumbering::support_point_wise(dof_handler); + 
AffineConstraints constraints; if (!(bp == BPType::BP1 || bp == BPType::BP2)) @@ -185,8 +188,6 @@ main(int argc, char *argv[]) constraints.close(); } - DoFRenumbering::support_point_wise(dof_handler); - const auto test = [&](const std::string &label, const auto &op) { (void)label; diff --git a/examples/deal.II/bps-cpu.h b/examples/deal.II/bps-cpu.h new file mode 100644 index 0000000000..71c00cea5d --- /dev/null +++ b/examples/deal.II/bps-cpu.h @@ -0,0 +1,219 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2023 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. +// +// Authors: Peter Munch, Martin Kronbichler +// +// --------------------------------------------------------------------- + +#pragma once +#ifndef bps_cpu_h +# define bps_cpu_h + +// deal.II includes +# include + +# include + +# include + +# include +# include +# include +# include + +// local includes +# include "bps.h" + +using namespace dealii; + + + +/** + * Operator CPU implementation using deal.II. + */ +template +class OperatorDealii : public OperatorBase +{ +public: + using VectorType = typename OperatorBase::VectorType; + + /** + * Constructor. + */ + OperatorDealii(const Mapping &mapping, + const DoFHandler &dof_handler, + const AffineConstraints &constraints, + const Quadrature &quadrature, + const BPType &bp) + : mapping(mapping) + , dof_handler(dof_handler) + , constraints(constraints) + , quadrature(quadrature) + , bp(bp) + { + reinit(); + } + + /** + * Destructor. 
+ */ + ~OperatorDealii() = default; + + /** + * Initialized internal data structures, particularly, MatrixFree. + */ + void + reinit() override + { + // configure MatrixFree + typename MatrixFree::AdditionalData additional_data; + additional_data.tasks_parallel_scheme = + MatrixFree::AdditionalData::TasksParallelScheme::none; + + // create MatrixFree + matrix_free.reinit(mapping, dof_handler, constraints, quadrature, additional_data); + } + + /** + * Matrix-vector product. + */ + void + vmult(VectorType &dst, const VectorType &src) const override + { + if (dof_handler.get_fe().n_components() == 1) + { + matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<1>, this, dst, src, true); + } + else + { + AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError()); + + matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range, this, dst, src, true); + } + } + + /** + * Initialize vector. + */ + void + initialize_dof_vector(VectorType &vec) const override + { + matrix_free.initialize_dof_vector(vec); + } + + /** + * Compute inverse of diagonal. + */ + void + compute_inverse_diagonal(VectorType &diagonal) const override + { + this->initialize_dof_vector(diagonal); + + if (dof_handler.get_fe().n_components() == 1) + { + MatrixFreeTools::compute_diagonal(matrix_free, + diagonal, + &OperatorDealii::do_cell_integral_local<1>, + this); + } + else + { + AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError()); + + MatrixFreeTools::compute_diagonal(matrix_free, + diagonal, + &OperatorDealii::do_cell_integral_local, + this); + } + + for (auto &i : diagonal) + i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0; + } + +private: + /** + * Cell integral without vector access. 
+ */ + template + void + do_cell_integral_local(FEEvaluation &phi) const + { + if (bp <= BPType::BP2) // mass matrix + { + phi.evaluate(EvaluationFlags::values); + for (const auto q : phi.quadrature_point_indices()) + phi.submit_value(phi.get_value(q), q); + phi.integrate(EvaluationFlags::values); + } + else // Poisson operator + { + phi.evaluate(EvaluationFlags::gradients); + for (const auto q : phi.quadrature_point_indices()) + phi.submit_gradient(phi.get_gradient(q), q); + phi.integrate(EvaluationFlags::gradients); + } + } + + /** + * Cell integral on a range of cells. + */ + template + void + do_cell_integral_range(const MatrixFree &matrix_free, + VectorType &dst, + const VectorType &src, + const std::pair &range) const + { + FEEvaluation phi(matrix_free, range); + + for (unsigned cell = range.first; cell < range.second; ++cell) + { + phi.reinit(cell); + phi.read_dof_values(src); // read source vector + do_cell_integral_local(phi); // cell integral + phi.distribute_local_to_global(dst); // write to destination vector + } + } + + /** + * Mapping object passed to the constructor. + */ + const Mapping &mapping; + + /** + * DoFHandler object passed to the constructor. + */ + const DoFHandler &dof_handler; + + /** + * Constraints object passed to the constructor. + */ + const AffineConstraints &constraints; + + /** + * Quadrature rule object passed to the constructor. + */ + const Quadrature &quadrature; + + /** + * Selected BP. + */ + const BPType bp; + + /** + * MatrixFree object. + */ + MatrixFree matrix_free; +}; + +#endif diff --git a/examples/deal.II/bps-kokkos.cc b/examples/deal.II/bps-kokkos.cc new file mode 100644 index 0000000000..86ef1a1693 --- /dev/null +++ b/examples/deal.II/bps-kokkos.cc @@ -0,0 +1,251 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2023 by the deal.II authors +// +// This file is part of the deal.II library. 
+// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. +// +// Authors: Peter Munch, Martin Kronbichler +// +// --------------------------------------------------------------------- + +// deal.II includes +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +// boost +#include + +#include + +// include operators +#include "bps-ceed.h" +#include "bps-kokkos.h" + +// Test cases +//TESTARGS(name="BP1") --resource {ceed_resource} --bp BP1 --fe_degree 2 --print_timings 0 +//TESTARGS(name="BP4") --resource {ceed_resource} --bp BP4 --fe_degree 1 --print_timings 0 + +/** + * Relevant parameters. + */ +struct Parameters +{ + BPType bp = BPType::BP5; + unsigned int n_global_refinements = 1; + unsigned int fe_degree = 2; + bool print_timings = true; + std::string libCEED_resource = "/cpu/self"; + + bool + parse(int argc, char *argv[]) + { + if (argc == 1 && (std::string(argv[0]) == "--help")) + { + std::cout << "Usage: ./bp [OPTION]..." 
<< std::endl; + std::cout << std::endl; + std::cout << "--bp name of benchmark (BP1-BP6)" << std::endl; + std::cout << "--n_refinements number of refinements (0-)" << std::endl; + std::cout << "--fe_degree polynomial degree (1-)" << std::endl; + std::cout << "--print_timings name of benchmark (0, 1)" << std::endl; + std::cout << "--resource name of resource (e.g., /cpu/self/avx/blocked)" << std::endl; + + return true; + } + + AssertThrow(argc % 2 == 0, ExcInternalError()); + + while (argc > 0) + { + std::string label(argv[0]); + + if ("--bp" == label) + { + std::string bp_string(argv[1]); + + if (bp_string == "BP1") + bp = BPType::BP1; + else if (bp_string == "BP2") + bp = BPType::BP2; + else if (bp_string == "BP3") + bp = BPType::BP3; + else if (bp_string == "BP4") + bp = BPType::BP4; + else if (bp_string == "BP5") + bp = BPType::BP5; + else if (bp_string == "BP6") + bp = BPType::BP6; + else + AssertThrow(false, ExcInternalError()); + } + else if ("--n_refinements" == label) + { + n_global_refinements = std::atoi(argv[1]); + } + else if ("--fe_degree" == label) + { + fe_degree = std::atoi(argv[1]); + } + else if ("--print_timings" == label) + { + print_timings = std::atoi(argv[1]); + } + else if ("--resource" == label) + { + libCEED_resource = std::string(argv[1]); + } + else + { + AssertThrow(false, ExcNotImplemented()); + } + + + argc -= 2; + argv += 2; + } + + return false; + } +}; + + + +int +main(int argc, char *argv[]) +{ + Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1); + + Parameters params; + if (params.parse(argc - 1, argv + 1)) + return 0; + + ConditionalOStream pout(std::cout, Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0); + + // configuration + const BPType bp = params.bp; + + using Number = double; + using VectorType = LinearAlgebra::distributed::Vector; + const unsigned int dim = 2; + const unsigned int fe_degree = params.fe_degree; + const unsigned int n_q_points = (bp <= BPType::BP4) ? 
(fe_degree + 2) : (fe_degree + 1); + const unsigned int n_refinements = params.n_global_refinements; + const unsigned int n_components = + (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5) ? 1 : dim; + + // create mapping, quadrature, fe, mesh, ... + MappingQ1 mapping; + QGauss quadrature(n_q_points); + FESystem fe(FE_Q(fe_degree), n_components); + +#ifdef DEAL_II_WITH_P4EST + parallel::distributed::Triangulation tria(MPI_COMM_WORLD); +#else + Triangulation tria; +#endif + + GridGenerator::hyper_cube(tria); + tria.refine_global(n_refinements); + + DoFHandler dof_handler(tria); + dof_handler.distribute_dofs(fe); + + DoFRenumbering::support_point_wise(dof_handler); + + AffineConstraints constraints; + + if (!(bp == BPType::BP1 || bp == BPType::BP2)) + { + // for stiffness matrix + DoFTools::make_zero_boundary_constraints(dof_handler, constraints); + constraints.close(); + } + + const auto test = [&](const std::string &label, const auto &op) { + (void)label; + + // initialize vector + VectorType u, v; + op.initialize_dof_vector(u); + op.initialize_dof_vector(v); + u = 1.0; + + constraints.set_zero(u); + + // perform matrix-vector product + op.vmult(v, u); + + // create solver + ReductionControl reduction_control(100, 1e-20, 1e-6); + + // create preconditioner + DiagonalMatrix diagonal_matrix; + op.compute_inverse_diagonal(diagonal_matrix.get_vector()); + + std::chrono::time_point now; + + bool not_converged = false; + + try + { + // solve problem + SolverCG solver(reduction_control); + now = std::chrono::system_clock::now(); + solver.solve(op, v, u, diagonal_matrix); + } + catch (const SolverControl::NoConvergence &) + { + pout << "Error: solver failed to converge with" << std::endl; + not_converged = true; + } + + + const auto time = + std::chrono::duration_cast(std::chrono::system_clock::now() - now) + .count() / + 1e9; + + + if (params.print_timings || not_converged) + { + pout << label << ": " << reduction_control.last_step() << " " << v.l2_norm() << " 
" + << (params.print_timings ? time : 0.0) << std::endl; + } + }; + + // create and test the libCEED operator + OperatorCeed op_ceed( + mapping, dof_handler, constraints, quadrature, bp, params.libCEED_resource); + test("ceed", op_ceed); + + // create and test a native deal.II operator + OperatorDealii op_dealii(mapping, dof_handler, constraints, quadrature, bp); + test("dealii", op_dealii); +} diff --git a/examples/deal.II/bps-kokkos.h b/examples/deal.II/bps-kokkos.h new file mode 100644 index 0000000000..bd8ba4f54f --- /dev/null +++ b/examples/deal.II/bps-kokkos.h @@ -0,0 +1,327 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2023 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. 
+// +// Authors: Peter Munch, Martin Kronbichler +// +// --------------------------------------------------------------------- + +#pragma once +#ifndef bps_kokkos_h +# define bps_kokkos_h + +// deal.II includes +# include + +# include + +# include + +# include +# include +# include +# include + +// local includes +# include "bps.h" + +using namespace dealii; + + + +template +class OperatorDealiiMassQuad +{ +public: + DEAL_II_HOST_DEVICE void + operator()(Portable::FEEvaluation *fe_eval, + const int q_point) const + { + fe_eval->submit_value(fe_eval->get_value(q_point), q_point); + } +}; + + + +template +class OperatorDealiiLaplaceQuad +{ +public: + DEAL_II_HOST_DEVICE void + operator()(Portable::FEEvaluation *fe_eval, + const int q_point) const + { + fe_eval->submit_gradient(fe_eval->get_gradient(q_point), q_point); + } +}; + + + +template +class OperatorDealiiMassLocal +{ +public: + DEAL_II_HOST_DEVICE void + operator()(const typename Portable::MatrixFree::Data *data, + const Portable::DeviceVector &src, + Portable::DeviceVector &dst) const + { + Portable::FEEvaluation fe_eval(data); + fe_eval.read_dof_values(src); + fe_eval.evaluate(EvaluationFlags::values); + fe_eval.apply_for_each_quad_point( + OperatorDealiiMassQuad()); + fe_eval.integrate(EvaluationFlags::values); + fe_eval.distribute_local_to_global(dst); + } + + static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim) * n_components; + static const unsigned int n_q_points = Utilities::pow(n_q_points_1d, dim); +}; + + + +template +class OperatorDealiiLaplaceLocal +{ +public: + DEAL_II_HOST_DEVICE void + operator()(const typename Portable::MatrixFree::Data *data, + const Portable::DeviceVector &src, + Portable::DeviceVector &dst) const + { + Portable::FEEvaluation fe_eval(data); + fe_eval.read_dof_values(src); + fe_eval.evaluate(EvaluationFlags::gradients); + fe_eval.apply_for_each_quad_point( + OperatorDealiiLaplaceQuad()); + fe_eval.integrate(EvaluationFlags::gradients); + 
fe_eval.distribute_local_to_global(dst); + } + + static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim) * n_components; + static const unsigned int n_q_points = Utilities::pow(n_q_points_1d, dim); +}; + + + +/** + * Operator GPU implementation using deal.II. + */ +template +class OperatorDealii : public OperatorBase +{ +public: + using VectorType = typename OperatorBase::VectorType; + + /** + * Constructor. + */ + OperatorDealii(const Mapping &mapping, + const DoFHandler &dof_handler, + const AffineConstraints &constraints, + const Quadrature &quadrature, + const BPType &bp) + : mapping(mapping) + , dof_handler(dof_handler) + , constraints(constraints) + , quadrature(quadrature) + , bp(bp) + { + reinit(); + } + + /** + * Destructor. + */ + ~OperatorDealii() = default; + + /** + * Initialized internal data structures, particularly, MatrixFree. + */ + void + reinit() override + { + // configure MatrixFree + typename Portable::MatrixFree::AdditionalData additional_data; + + if (bp <= BPType::BP2) // mass matrix + additional_data.mapping_update_flags = update_JxW_values | update_values; + else + additional_data.mapping_update_flags = update_JxW_values | update_gradients; + + // create MatrixFree + AssertThrow(quadrature.is_tensor_product(), ExcNotImplemented()); + matrix_free.reinit( + mapping, dof_handler, constraints, quadrature.get_tensor_basis()[0], additional_data); + } + + /** + * Matrix-vector product. 
+ */ + void + vmult(VectorType &dst, const VectorType &src) const override + { + dst = 0.0; + + const unsigned int n_components = dof_handler.get_fe().n_components(); + const unsigned int fe_degree = dof_handler.get_fe().tensor_degree(); + const unsigned int n_q_points_1d = quadrature.get_tensor_basis()[0].size(); + + if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 2) + this->vmult_internal<1, 1, 2>(dst, src); + else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 3) + this->vmult_internal<1, 2, 3>(dst, src); + else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 2) + this->vmult_internal(dst, src); + else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 3) + this->vmult_internal(dst, src); + else if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 3) + this->vmult_internal<1, 1, 3>(dst, src); + else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 4) + this->vmult_internal<1, 2, 4>(dst, src); + else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 3) + this->vmult_internal(dst, src); + else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 4) + this->vmult_internal(dst, src); + else + AssertThrow(false, ExcInternalError()); + + matrix_free.copy_constrained_values(src, dst); + } + + /** + * Initialize vector. + */ + void + initialize_dof_vector(VectorType &vec) const override + { + matrix_free.initialize_dof_vector(vec); + } + + /** + * Compute inverse of diagonal. 
+ */ + void + compute_inverse_diagonal(VectorType &diagonal) const override + { + this->initialize_dof_vector(diagonal); + + const unsigned int n_components = dof_handler.get_fe().n_components(); + const unsigned int fe_degree = dof_handler.get_fe().tensor_degree(); + const unsigned int n_q_points_1d = quadrature.get_tensor_basis()[0].size(); + + if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 2) + this->compute_inverse_diagonal_internal<1, 1, 2>(diagonal); + else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 3) + this->compute_inverse_diagonal_internal<1, 2, 3>(diagonal); + else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 2) + this->compute_inverse_diagonal_internal(diagonal); + else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 3) + this->compute_inverse_diagonal_internal(diagonal); + else if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 3) + this->compute_inverse_diagonal_internal<1, 1, 3>(diagonal); + else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 4) + this->compute_inverse_diagonal_internal<1, 2, 4>(diagonal); + else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 3) + this->compute_inverse_diagonal_internal(diagonal); + else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 4) + this->compute_inverse_diagonal_internal(diagonal); + else + AssertThrow(false, ExcInternalError()); + } + +private: + /** + * Templated vmult function. + */ + template + void + vmult_internal(VectorType &dst, const VectorType &src) const + { + if (bp <= BPType::BP2) // mass matrix + { + OperatorDealiiMassLocal mass_operator; + matrix_free.cell_loop(mass_operator, src, dst); + } + else + { + OperatorDealiiLaplaceLocal + local_operator; + matrix_free.cell_loop(local_operator, src, dst); + } + } + + /** + * Templated compute_inverse_diagonal function. 
+ */ + template + void + compute_inverse_diagonal_internal(VectorType &diagonal) const + { + if (bp <= BPType::BP2) // mass matrix + { + OperatorDealiiMassQuad op_quad; + + MatrixFreeTools::compute_diagonal( + matrix_free, diagonal, op_quad, EvaluationFlags::values, EvaluationFlags::values); + } + else + { + OperatorDealiiLaplaceQuad op_quad; + + MatrixFreeTools::compute_diagonal( + matrix_free, diagonal, op_quad, EvaluationFlags::gradients, EvaluationFlags::gradients); + } + + + Number *diagonal_ptr = diagonal.get_values(); + + Kokkos::parallel_for( + "lethe::invert_vector", + Kokkos::RangePolicy( + 0, diagonal.locally_owned_size()), + KOKKOS_LAMBDA(int i) { diagonal_ptr[i] = 1.0 / diagonal_ptr[i]; }); + } + + /** + * Mapping object passed to the constructor. + */ + const Mapping &mapping; + + /** + * DoFHandler object passed to the constructor. + */ + const DoFHandler &dof_handler; + + /** + * Constraints object passed to the constructor. + */ + const AffineConstraints &constraints; + + /** + * Quadrature rule object passed to the constructor. + */ + const Quadrature &quadrature; + + /** + * Selected BP. + */ + const BPType bp; + + /** + * MatrixFree object. 
+ */ + Portable::MatrixFree matrix_free; +}; + +#endif diff --git a/examples/deal.II/bps-qfunctions.h b/examples/deal.II/bps-qfunctions.h index 6161fdf840..b6a0c498c7 100644 --- a/examples/deal.II/bps-qfunctions.h +++ b/examples/deal.II/bps-qfunctions.h @@ -15,7 +15,7 @@ // // --------------------------------------------------------------------- -#include +#include diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h index 677ed1a81f..b7d28919bc 100644 --- a/examples/deal.II/bps.h +++ b/examples/deal.II/bps.h @@ -15,24 +15,25 @@ // // --------------------------------------------------------------------- +#pragma once +#ifndef bps_h +# define bps_h + // deal.II includes -#include +# include -#include +# include -#include +# include -#include -#include -#include +# include +# include +# include +# include -// libCEED includes -#include +using namespace dealii; -// QFunction source -#include "bps-qfunctions.h" -using namespace dealii; /** * BP types. For more details, see https://ceed.exascaleproject.org/bps/. @@ -92,14 +93,14 @@ struct BPInfo /** * Base class of operators. */ -template +template class OperatorBase { public: /** * deal.II vector type */ - using VectorType = LinearAlgebra::distributed::Vector; + using VectorType = LinearAlgebra::distributed::Vector; /** * Initialize vector. @@ -126,766 +127,4 @@ class OperatorBase compute_inverse_diagonal(VectorType &diagonal) const = 0; }; - -/** - * Operator implementation using libCEED. - */ -template -class OperatorCeed : public OperatorBase -{ -public: - using VectorType = typename OperatorBase::VectorType; - - /** - * Constructor. 
- */ - OperatorCeed(const Mapping &mapping, - const DoFHandler &dof_handler, - const AffineConstraints &constraints, - const Quadrature &quadrature, - const BPType &bp, - const std::string &resource) - : mapping(mapping) - , dof_handler(dof_handler) - , constraints(constraints) - , quadrature(quadrature) - , bp(bp) - , resource(resource) - { - reinit(); - } - - /** - * Destructor. - */ - ~OperatorCeed() - { - CeedOperatorDestroy(&op_apply); - CeedQFunctionDestroy(&qf_apply); - CeedQFunctionContextDestroy(&build_ctx); - CeedVectorDestroy(&q_data); - CeedElemRestrictionDestroy(&q_data_restriction); - CeedElemRestrictionDestroy(&sol_restriction); - CeedBasisDestroy(&sol_basis); - CeedDestroy(&ceed); - } - - /** - * Initialized internal data structures, particularly, libCEED. - */ - void - reinit() override - { - const auto &tria = dof_handler.get_triangulation(); - const auto &fe = dof_handler.get_fe(); - - const auto n_components = fe.n_components(); - - if (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5) - { - AssertThrow(n_components == 1, ExcInternalError()); - } - else - { - AssertThrow(n_components == dim, ExcInternalError()); - } - - // 1) create CEED instance -> "MatrixFree" - const char *ceed_spec = resource.c_str(); - CeedInit(ceed_spec, &ceed); - - // 2) create shape functions -> "ShapeInfo" - const unsigned int fe_degree = fe.tensor_degree(); - const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size(); - CeedBasisCreateTensorH1Lagrange( - ceed, dim, n_components, fe_degree + 1, n_q_points, CEED_GAUSS, &sol_basis); - - // 3) create restriction matrix -> DoFInfo - unsigned int n_local_active_cells = 0; - - for (const auto &cell : dof_handler.active_cell_iterators()) - if (cell->is_locally_owned()) - n_local_active_cells++; - - partitioner = - std::make_shared(dof_handler.locally_owned_dofs(), - DoFTools::extract_locally_active_dofs( - dof_handler), - dof_handler.get_communicator()); - - std::vector indices; - 
indices.reserve(n_local_active_cells * fe.n_dofs_per_cell() / n_components); - - const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering(fe_degree); - - std::vector local_indices(fe.n_dofs_per_cell()); - - for (const auto &cell : dof_handler.active_cell_iterators()) - if (cell->is_locally_owned()) - { - cell->get_dof_indices(local_indices); - - for (const auto i : dof_mapping) - indices.emplace_back( - partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]) / - n_components); - } - - CeedElemRestrictionCreate(ceed, - n_local_active_cells, - fe.n_dofs_per_cell() / n_components, - n_components, - std::max(this->extended_local_size() / n_components, 1), - this->extended_local_size(), - CEED_MEM_HOST, - CEED_COPY_VALUES, - indices.data(), - &sol_restriction); - - // 4) create mapping -> MappingInfo - const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2); - - this->weights = compute_metric_data(ceed, mapping, tria, quadrature, bp); - - strides = {{1, - static_cast(quadrature.size()), - static_cast(quadrature.size() * n_components_metric)}}; - CeedVectorCreate(ceed, weights.size(), &q_data); - CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, weights.data()); - CeedElemRestrictionCreateStrided(ceed, - n_local_active_cells, - quadrature.size(), - n_components_metric, - weights.size(), - strides.data(), - &q_data_restriction); - - build_ctx_data.dim = dim; - build_ctx_data.space_dim = dim; - - CeedQFunctionContextCreate(ceed, &build_ctx); - CeedQFunctionContextSetData( - build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); - - // 5) create q operation - if (bp == BPType::BP1) - CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply); - else if (bp == BPType::BP2) - CeedQFunctionCreateInterior(ceed, 1, f_apply_mass_vec, f_apply_mass_vec_loc, &qf_apply); - else if (bp == BPType::BP3 || bp == BPType::BP5) - 
CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson, f_apply_poisson_loc, &qf_apply); - else if (bp == BPType::BP4 || bp == BPType::BP6) - CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson_vec, f_apply_poisson_vec_loc, &qf_apply); - else - AssertThrow(false, ExcInternalError()); - - if (bp <= BPType::BP2) - CeedQFunctionAddInput(qf_apply, "u", n_components, CEED_EVAL_INTERP); - else - CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD); - - CeedQFunctionAddInput(qf_apply, "qdata", n_components_metric, CEED_EVAL_NONE); - - if (bp <= BPType::BP2) - CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP); - else - CeedQFunctionAddOutput(qf_apply, "v", dim * n_components, CEED_EVAL_GRAD); - - CeedQFunctionSetContext(qf_apply, build_ctx); - - // 6) put everything together - CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); - - CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data); - CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); - } - - /** - * Perform matrix-vector product. 
- */ - void - vmult(VectorType &dst, const VectorType &src) const override - { - // communicate: update ghost values - src.update_ghost_values(); - - if (dof_handler.get_fe().n_components() == 1) - { - // create libCEED view on deal.II vectors - VectorTypeCeed src_ceed(ceed, src); - VectorTypeCeed dst_ceed(ceed, dst); - - // apply operator - CeedOperatorApply(op_apply, src_ceed(), dst_ceed(), CEED_REQUEST_IMMEDIATE); - } - else // TODO: needed for multiple components - { - // allocate space for block vectors - src_tmp.reinit(this->extended_local_size(), true); - dst_tmp.reinit(this->extended_local_size(), true); - - copy_to_block_vector(src_tmp, src); // copy to block vector - - // create libCEED view on deal.II vectors - VectorTypeCeed src_ceed(ceed, src_tmp); - VectorTypeCeed dst_ceed(ceed, dst_tmp); - - // apply operator - CeedOperatorApply(op_apply, src_ceed(), dst_ceed(), CEED_REQUEST_IMMEDIATE); - - dst_ceed.sync_to_host(); // pull libCEED data back to host - copy_from_block_vector(dst, dst_tmp); // copy from block vector - } - - // communicate: compress - src.zero_out_ghost_values(); - dst.compress(VectorOperation::add); - - // apply constraints: we assume homogeneous DBC - constraints.set_zero(dst); - } - - /** - * Initialized vector. - */ - void - initialize_dof_vector(VectorType &vec) const override - { - vec.reinit(partitioner); - } - - /** - * Compute inverse of diagonal. 
- */ - void - compute_inverse_diagonal(VectorType &diagonal) const override - { - this->initialize_dof_vector(diagonal); - - VectorTypeCeed diagonal_ceed(ceed, diagonal); - - CeedOperatorLinearAssembleDiagonal(op_apply, diagonal_ceed(), CEED_REQUEST_IMMEDIATE); - - const unsigned int n_components = dof_handler.get_fe().n_components(); - - if (n_components > 1) // TODO: needed for multiple components - { - VectorType tmp(diagonal); - - copy_from_block_vector(tmp, diagonal); - - std::swap(tmp, diagonal); - } - - diagonal.compress(VectorOperation::add); - - for (auto &i : diagonal) - i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0; - } - -private: - /** - * Wrapper around a deal.II vector to create a libCEED vector view. - */ - class VectorTypeCeed - { - public: - /** - * Constructor. - */ - VectorTypeCeed(const Ceed &ceed, const VectorType &vec) - { - const unsigned int n_dofs = - vec.get_partitioner()->locally_owned_size() + vec.get_partitioner()->n_ghost_indices(); - - CeedVectorCreate(ceed, n_dofs, &vec_ceed); - CeedVectorSetArray(vec_ceed, CEED_MEM_HOST, CEED_USE_POINTER, vec.get_values()); - } - - /** - * Return libCEED vector view. - */ - CeedVector & - operator()() - { - return vec_ceed; - } - - /** - * Sync memory from device to host. - */ - void - sync_to_host() - { - CeedVectorSyncArray(vec_ceed, CEED_MEM_HOST); - } - - /** - * Destructor: destroy vector view. - */ - ~VectorTypeCeed() - { - CeedScalar *ptr; - CeedVectorTakeArray(vec_ceed, CEED_MEM_HOST, &ptr); - CeedVectorDestroy(&vec_ceed); - } - - private: - /** - * libCEED vector view. - */ - CeedVector vec_ceed; - }; - - /** - * Copy from block vector. - * - * @note Only needed for multiple components. 
- */ - void - copy_from_block_vector(VectorType &dst, const VectorType &src) const - { - const unsigned int scalar_size = this->extended_local_size() / dim; - - for (unsigned int i = 0; i < scalar_size; ++i) - for (unsigned int j = 0; j < dim; ++j) - dst.get_values()[j + i * dim] = src.get_values()[j * scalar_size + i]; - } - - /** - * Copy to block vector. - * - * @note Only needed for multiple components. - */ - void - copy_to_block_vector(VectorType &dst, const VectorType &src) const - { - const unsigned int scalar_size = this->extended_local_size() / dim; - - for (unsigned int i = 0; i < scalar_size; ++i) - for (unsigned int j = 0; j < dim; ++j) - dst.get_values()[j * scalar_size + i] = src.get_values()[j + i * dim]; - } - - /** - * Number of locally active DoFs. - */ - unsigned int - extended_local_size() const - { - return partitioner->locally_owned_size() + partitioner->n_ghost_indices(); - } - - /** - * Compute metric data: Jacobian, ... - */ - static std::vector - compute_metric_data(const Ceed &ceed, - const Mapping &mapping, - const Triangulation &tria, - const Quadrature &quadrature, - const BPType bp) - { - std::vector weights; - - if (false) - { - FE_Nothing dummy_fe; - FEValues fe_values(mapping, dummy_fe, quadrature, update_JxW_values); - - for (const auto &cell : tria.active_cell_iterators()) - if (cell->is_locally_owned()) - { - fe_values.reinit(cell); - - for (const auto q : fe_values.quadrature_point_indices()) - weights.emplace_back(fe_values.JxW(q)); - } - - return weights; - } - - CeedBasis geo_basis; - CeedVector q_data; - CeedElemRestriction q_data_restriction; - CeedVector node_coords; - CeedElemRestriction geo_restriction; - CeedQFunctionContext build_ctx; - CeedQFunction qf_build; - CeedOperator op_build; - - const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size(); - - const unsigned int n_components = (bp <= BPType::BP2) ? 
1 : (dim * (dim + 1) / 2); - - const auto mapping_q = dynamic_cast *>(&mapping); - - AssertThrow(mapping_q, ExcMessage("Wrong mapping!")); - - const unsigned int fe_degree = mapping_q->get_degree(); - - CeedBasisCreateTensorH1Lagrange( - ceed, dim, dim, fe_degree + 1, n_q_points, CEED_GAUSS, &geo_basis); - - unsigned int n_local_active_cells = 0; - - for (const auto &cell : tria.active_cell_iterators()) - if (cell->is_locally_owned()) - n_local_active_cells++; - - std::vector geo_support_points; - std::vector geo_indices; - - FE_Q geo_fe(fe_degree); - - DoFHandler geo_dof_handler(tria); - geo_dof_handler.distribute_dofs(geo_fe); - - const auto geo_partitioner = - std::make_shared(geo_dof_handler.locally_owned_dofs(), - DoFTools::extract_locally_active_dofs( - geo_dof_handler), - geo_dof_handler.get_communicator()); - - geo_indices.reserve(n_local_active_cells * geo_fe.n_dofs_per_cell()); - - const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering(fe_degree); - - FEValues fe_values(mapping, - geo_fe, - geo_fe.get_unit_support_points(), - update_quadrature_points); - - std::vector local_indices(geo_fe.n_dofs_per_cell()); - - const unsigned int n_points = - geo_partitioner->locally_owned_size() + geo_partitioner->n_ghost_indices(); - - geo_support_points.resize(dim * n_points); - - for (const auto &cell : geo_dof_handler.active_cell_iterators()) - if (cell->is_locally_owned()) - { - fe_values.reinit(cell); - cell->get_dof_indices(local_indices); - - for (const auto i : dof_mapping) - { - const auto index = geo_partitioner->global_to_local(local_indices[i]); - geo_indices.emplace_back(index); - - const auto point = fe_values.quadrature_point(i); - - for (unsigned int d = 0; d < dim; ++d) - geo_support_points[index + d * n_points] = point[d]; - } - } - - weights.resize(n_local_active_cells * quadrature.size() * n_components); - - CeedInt strides[3] = {1, - static_cast(quadrature.size()), - static_cast(quadrature.size() * n_components)}; - - 
CeedVectorCreate(ceed, weights.size(), &q_data); - CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, weights.data()); - CeedElemRestrictionCreateStrided(ceed, - n_local_active_cells, - quadrature.size(), - n_components, - weights.size(), - strides, - &q_data_restriction); - - CeedVectorCreate(ceed, geo_support_points.size(), &node_coords); - CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data()); - - CeedElemRestrictionCreate(ceed, - n_local_active_cells, - geo_fe.n_dofs_per_cell(), - dim, - std::max(geo_support_points.size() / dim, 1), - geo_support_points.size(), - CEED_MEM_HOST, - CEED_COPY_VALUES, - geo_indices.data(), - &geo_restriction); - - BuildContext build_ctx_data; - build_ctx_data.dim = dim; - build_ctx_data.space_dim = dim; - - CeedQFunctionContextCreate(ceed, &build_ctx); - CeedQFunctionContextSetData( - build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); - - // 5) create q operation - if (bp <= BPType::BP2) - CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build); - else - CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build); - - CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(qf_build, "qdata", n_components, CEED_EVAL_NONE); - CeedQFunctionSetContext(qf_build, build_ctx); - - // 6) put everything together - CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); - CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE); - CeedOperatorSetField( - op_build, "weights", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE); - CeedOperatorSetField( - op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - - CeedOperatorApply(op_build, node_coords, q_data, CEED_REQUEST_IMMEDIATE); - - CeedOperatorDestroy(&op_build); - 
CeedQFunctionDestroy(&qf_build); - CeedQFunctionContextDestroy(&build_ctx); - CeedElemRestrictionDestroy(&geo_restriction); - CeedVectorDestroy(&node_coords); - CeedElemRestrictionDestroy(&q_data_restriction); - CeedVectorSyncArray(q_data, CEED_MEM_HOST); - CeedVectorDestroy(&q_data); - CeedBasisDestroy(&geo_basis); - - return weights; - } - - /** - * Mapping object passed to the constructor. - */ - const Mapping &mapping; - - /** - * DoFHandler object passed to the constructor. - */ - const DoFHandler &dof_handler; - - /** - * Constraints object passed to the constructor. - */ - const AffineConstraints &constraints; - - /** - * Quadrature rule object passed to the constructor. - */ - const Quadrature &quadrature; - - /** - * Selected BP. - */ - const BPType bp; - - /** - * Resource name. - */ - const std::string resource; - - /** - * Partitioner for distributed vectors. - */ - std::shared_ptr partitioner; - - /** - * libCEED data structures. - */ - Ceed ceed; - CeedBasis sol_basis; - CeedElemRestriction sol_restriction; - CeedElemRestriction q_data_restriction; - std::vector weights; - CeedVector q_data; - std::array strides; - BuildContext build_ctx_data; - CeedQFunctionContext build_ctx; - CeedQFunction qf_apply; - CeedOperator op_apply; - - /** - * Temporal (tempral) vectors. - * - * @note Only needed for multiple components. - */ - mutable VectorType src_tmp; - mutable VectorType dst_tmp; -}; - - - -template -class OperatorDealii : public OperatorBase -{ -public: - using VectorType = typename OperatorBase::VectorType; - - /** - * Constructor. - */ - OperatorDealii(const Mapping &mapping, - const DoFHandler &dof_handler, - const AffineConstraints &constraints, - const Quadrature &quadrature, - const BPType &bp) - : mapping(mapping) - , dof_handler(dof_handler) - , constraints(constraints) - , quadrature(quadrature) - , bp(bp) - { - reinit(); - } - - /** - * Destructor. 
- */ - ~OperatorDealii() = default; - - /** - * Initialized internal data structures, particularly, MatrixFree. - */ - void - reinit() override - { - // configure MatrixFree - typename MatrixFree::AdditionalData additional_data; - additional_data.tasks_parallel_scheme = - MatrixFree::AdditionalData::TasksParallelScheme::none; - - // create MatrixFree - matrix_free.reinit(mapping, dof_handler, constraints, quadrature, additional_data); - } - - /** - * Matrix-vector product. - */ - void - vmult(VectorType &dst, const VectorType &src) const override - { - if (dof_handler.get_fe().n_components() == 1) - { - matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<1>, this, dst, src, true); - } - else - { - AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError()); - - matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range, this, dst, src, true); - } - } - - /** - * Initialize vector. - */ - void - initialize_dof_vector(VectorType &vec) const override - { - matrix_free.initialize_dof_vector(vec); - } - - /** - * Compute inverse of diagonal. - */ - void - compute_inverse_diagonal(VectorType &diagonal) const override - { - this->initialize_dof_vector(diagonal); - - if (dof_handler.get_fe().n_components() == 1) - { - MatrixFreeTools::compute_diagonal(matrix_free, - diagonal, - &OperatorDealii::do_cell_integral_local<1>, - this); - } - else - { - AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError()); - - MatrixFreeTools::compute_diagonal(matrix_free, - diagonal, - &OperatorDealii::do_cell_integral_local, - this); - } - - for (auto &i : diagonal) - i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0; - } - -private: - /** - * Cell integral without vector access. 
- */ - template - void - do_cell_integral_local(FEEvaluation &phi) const - { - if (bp <= BPType::BP2) // mass matrix - { - phi.evaluate(EvaluationFlags::values); - for (const auto q : phi.quadrature_point_indices()) - phi.submit_value(phi.get_value(q), q); - phi.integrate(EvaluationFlags::values); - } - else // Poisson operator - { - phi.evaluate(EvaluationFlags::gradients); - for (const auto q : phi.quadrature_point_indices()) - phi.submit_gradient(phi.get_gradient(q), q); - phi.integrate(EvaluationFlags::gradients); - } - } - - /** - * Cell integral on a range of cells. - */ - template - void - do_cell_integral_range(const MatrixFree &matrix_free, - VectorType &dst, - const VectorType &src, - const std::pair &range) const - { - FEEvaluation phi(matrix_free, range); - - for (unsigned cell = range.first; cell < range.second; ++cell) - { - phi.reinit(cell); - phi.read_dof_values(src); // read source vector - do_cell_integral_local(phi); // cell integral - phi.distribute_local_to_global(dst); // write to destination vector - } - } - - /** - * Mapping object passed to the constructor. - */ - const Mapping &mapping; - - /** - * DoFHandler object passed to the constructor. - */ - const DoFHandler &dof_handler; - - /** - * Constraints object passed to the constructor. - */ - const AffineConstraints &constraints; - - /** - * Quadrature rule object passed to the constructor. - */ - const Quadrature &quadrature; - - /** - * Selected BP. - */ - const BPType bp; - - /** - * MatrixFree object. - */ - MatrixFree matrix_free; -}; +#endif diff --git a/examples/fluids/Makefile b/examples/fluids/Makefile index f5a1d7c8c0..c99a63a0b3 100644 --- a/examples/fluids/Makefile +++ b/examples/fluids/Makefile @@ -23,11 +23,8 @@ PETSc.pc := $(PETSC_DIR)/$(PETSC_ARCH)/lib/pkgconfig/PETSc.pc CEED_DIR ?= ../.. 
ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc -# ASAN must be left empty if you don't want to use it -ASAN ?= - CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc)) -CFLAGS = -std=c99 \ +CFLAGS = -std=c11 \ $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \ $(call pkgconf, --cflags-only-other $(PETSc.pc)) \ $(OPT) $(OPT_EXAMPLES) @@ -37,12 +34,17 @@ LDFLAGS = $(call pkgconf, --libs-only-L --libs-only-other $(PETSc.pc) $(ceed.pc) LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(PETSc.pc) $(ceed.pc))) LDLIBS = $(call pkgconf, --libs-only-l $(PETSc.pc) $(ceed.pc)) -lm -AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer +# Address Sanitizer Setup +# ASAN must be left empty if you don't want to use it +ASAN ?= +AFLAGS ?= -fsanitize=address +# Also: -fsanitize=undefined -fno-omit-frame-pointer CFLAGS += $(if $(ASAN),$(AFLAGS)) FFLAGS += $(if $(ASAN),$(AFLAGS)) LDFLAGS += $(if $(ASAN),$(AFLAGS)) CPPFLAGS += -I./include +# Source Files OBJDIR := build SRCDIR := src PROBLEMDIR := problems @@ -50,24 +52,10 @@ PROBLEMDIR := problems src.c := navierstokes.c $(sort $(wildcard $(PROBLEMDIR)/*.c)) $(sort $(wildcard $(SRCDIR)/*.c)) src.o = $(src.c:%.c=$(OBJDIR)/%.o) -# Path to install directory for SmartRedis. 
Example: /software/smartredis/install -SMARTREDIS_DIR ?= -ifdef SMARTREDIS_DIR - hiredis.pc := $(SMARTREDIS_DIR)/lib/pkgconfig/hiredis.pc - lsmartredis:= -lsmartredis - redis++.pc = $(wildcard $(SMARTREDIS_DIR)/lib/pkgconfig/redis++.pc $(SMARTREDIS_DIR)/lib64/pkgconfig/redis++.pc) - - CPPFLAGS += $(call pkgconf, --cflags-only-I $(hiredis.pc) $(redis++.pc)) - LDFLAGS += $(call pkgconf, --libs-only-L --libs-only-other $(hiredis.pc) $(redis++.pc)) - LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(hiredis.pc) $(redis++.pc))) - LDLIBS += $(call pkgconf, --libs-only-l $(hiredis.pc) $(redis++.pc)) $(lsmartredis) - src.c += $(sort $(wildcard $(SRCDIR)/smartsim/*.c)) -endif - all: navierstokes navierstokes: $(src.o) | $(PETSc.pc) $(ceed.pc) - $(call quiet,LINK.o) $(CEED_LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + $(call quiet,LINK.o) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ .SECONDEXPANSION: # to expand $$(@D)/.DIR %/.DIR : @@ -77,7 +65,7 @@ navierstokes: $(src.o) | $(PETSc.pc) $(ceed.pc) # Quiet, color output quiet ?= $($(1)) -$(OBJDIR)/%.o : %.c | $$(@D)/.DIR +$(OBJDIR)/%.o : %.c Makefile | $$(@D)/.DIR $(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<) print: $(PETSc.pc) $(ceed.pc) diff --git a/examples/fluids/README.md b/examples/fluids/README.md index 179e3950c8..ec0ea560e1 100644 --- a/examples/fluids/README.md +++ b/examples/fluids/README.md @@ -1,7 +1,8 @@ ## libCEED: Navier-Stokes Example This page provides a description of the Navier-Stokes example for the libCEED library, based on PETSc. -PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required. + +HONEE, a more fully featured fluid dynamics solver, can be found on [GitLab](https://gitlab.com/phypid/honee). The Navier-Stokes problem solves the compressible Navier-Stokes equations in three dimensions using an explicit time integration. The state variables are mass density, momentum density, and energy density. 
@@ -20,15 +21,9 @@ and run with: ./navierstokes -ceed [ceed] -problem [problem type] -degree [degree] ``` -If you want to do *in situ* machine-learning training, specify `SMARTREDIS_DIR` in the make command like: - -``` -make SMARTREDIS_DIR=~/software/smartredis/install -``` - ## Runtime options -% inclusion-fluids-marker + The Navier-Stokes mini-app is controlled via command-line options. The following options are common among all problem types: @@ -615,7 +610,7 @@ For the Density Current, Channel, and Blasius problems, the following common com - boolean * - `-state_var` - - State variables to solve solution with. `conservative` ($\rho, \rho \bm{u}, \rho e$) or `primitive` ($P, \bm{u}, T$) + - State variables to solve solution with. `conservative` ($\rho, \rho \bm{u}, \rho e$), `primitive` ($P, \bm{u}, T$), or `entropy` ($\frac{\gamma - s}{\gamma - 1} - \frac{\rho}{P} (e - c_v T),\ \frac{\rho}{P} \bm{u},\ -\frac{\rho}{P}$) where $s = \ln(P\rho^{-\gamma})$ - `conservative` - string @@ -634,25 +629,10 @@ For the Density Current, Channel, and Blasius problems, the following common com - `0` - `m` -* - `-sgs_model_type` - - Type of subgrid stress model to use. Currently only `data_driven` is available - - `none` - - string - -* - `-sgs_model_dd_leakyrelu_alpha` - - Slope parameter for Leaky ReLU activation function. `0` corresponds to normal ReLU - - 0 - - - -* - `-sgs_model_dd_parameter_dir` - - Path to directory with data-driven model parameters (weights, biases, etc.) - - `./dd_sgs_parameters` - - string - -* - `-sgs_model_dd_use_fused` - - Whether to use "fused" mode for data-driven model evaluation - - `true` - - boolean +* - `-idl_pressure` + - Pressure used for IDL reference pressure + - `-reference_pressure` + - `Pa` * - `-diff_filter_monitor` - Enable differential filter TSMonitor @@ -688,31 +668,6 @@ For the Density Current, Channel, and Blasius problems, the following common com - Friction length associated with the flow, $\delta_\nu$. 
Used in wall-damping functions - 0 - `m` - -* - `-sgs_train_enable` - - Whether to enable *in situ* training of data-driven SGS model. Require building with SmartRedis. - - `false` - - boolean - -* - `-sgs_train_write_data_interval` - - Number of timesteps between writing training data into SmartRedis database - - `1` - - - -* - `-sgs_train_overwrite_data` - - Whether new training data should overwrite old data on database - - `true` - - boolean - -* - `-sgs_train_filter_widths` - - List of scalar values for different filter widths to calculate for training data - - - - `m` - -* - `-smartsim_collocated_num_ranks` - - Number of MPI ranks associated with each collocated database (i.e. ranks per node) - - `1` - - ::: #### Gaussian Wave @@ -948,6 +903,11 @@ The Blasius problem has the following command-line options in addition to the Ne - `288` - `K` +* - `-pressure_infinity` + - Atmospheric pressure, also sets IDL reference pressure + - `1.01E5` + - `Pa` + * - `-temperature_wall` - Wall temperature - `288` @@ -958,11 +918,6 @@ The Blasius problem has the following command-line options in addition to the Ne - `4.2e-3` - `m` -* - `-P0` - - Atmospheric pressure - - `1.01E5` - - `Pa` - * - `-platemesh_modify_mesh` - Whether to modify the mesh using the given options below. - `false` @@ -1067,6 +1022,16 @@ Using the STG Inflow for the blasius problem adds the following command-line opt - `false` - +* - `-stg_dx` + - Set the element size in the x direction. Default is calculated for box meshes, assuming equispaced elements. 
+ - + - `m` + +* - `-stg_h_scale_factor` + - Scale element size for cutoff frequency calculation + - $1/p$ + - + ::: This problem can be run with the `blasius.yaml` file via: diff --git a/examples/fluids/dd_sgs_data/OutScaling.dat b/examples/fluids/dd_sgs_data/OutScaling.dat deleted file mode 100644 index 94dab73636..0000000000 --- a/examples/fluids/dd_sgs_data/OutScaling.dat +++ /dev/null @@ -1,13 +0,0 @@ -12 1 -0e+00 -2e+00 -0e+00 -2e+00 -0e+00 -2e+00 --1e+00 -1e+00 --1e+00 -1e+00 --1e+00 -1e+00 diff --git a/examples/fluids/dd_sgs_data/b1.dat b/examples/fluids/dd_sgs_data/b1.dat deleted file mode 100644 index 873f658ea9..0000000000 --- a/examples/fluids/dd_sgs_data/b1.dat +++ /dev/null @@ -1,21 +0,0 @@ -20 1 -4.899884770038e-01 -3.563204159517e-01 -2.627287776915e-01 -2.951473061921e-02 --4.622340771977e-01 --1.209842939357e-02 --4.663763370896e-01 -8.796932075820e-02 -4.501638907868e-01 -2.077678516370e-01 --1.139284062351e-01 --3.303352644675e-01 --4.148295154500e-01 --4.833042778786e-02 -2.972372410179e-02 --2.464389991227e-01 --2.877421872362e-01 --4.567405721457e-01 -4.734193646824e-01 --4.818997410080e-01 diff --git a/examples/fluids/dd_sgs_data/b2.dat b/examples/fluids/dd_sgs_data/b2.dat deleted file mode 100644 index 4ff5bd0b30..0000000000 --- a/examples/fluids/dd_sgs_data/b2.dat +++ /dev/null @@ -1,7 +0,0 @@ -6 1 -1.176169920799e-01 --2.134958413350e-01 -1.512851885922e-01 -1.612014419874e-01 --1.437293376985e-02 -2.899547585024e-01 diff --git a/examples/fluids/dd_sgs_data/w1.dat b/examples/fluids/dd_sgs_data/w1.dat deleted file mode 100644 index f27466d680..0000000000 --- a/examples/fluids/dd_sgs_data/w1.dat +++ /dev/null @@ -1,121 +0,0 @@ -120 1 --1.573046615553e-01 --8.451867037896e-02 -1.685678425651e-01 --4.017536901661e-01 -4.068168468515e-01 --1.642360540833e-01 -3.676945195442e-01 -3.470122358933e-02 -8.395344749312e-02 -1.230997497940e-01 -1.211759010593e-02 -2.570394361674e-01 -3.340400131793e-01 -2.342342193371e-01 -3.243180076338e-01 
-1.235553459881e-01 --7.501312735230e-03 -1.277232278360e-01 -5.125506452634e-02 -1.844073315749e-02 -3.604786379338e-02 -2.063091161008e-01 -2.406054256905e-01 -1.846018306032e-01 -8.525111135827e-02 -3.795039661101e-01 --5.693426468413e-02 --8.111639981827e-02 -5.813760592106e-02 -1.490767475429e-01 --2.410115518494e-01 -2.173034199970e-01 -1.497734413376e-01 --1.296487298257e-01 -2.078686368723e-01 -3.891951801941e-01 -4.243457269355e-01 -1.735201583344e-02 --8.168373382023e-02 --5.933063216886e-02 -1.886585865778e-01 -1.756465348482e-01 -3.295663670792e-01 -1.056135052370e-01 --2.574613681620e-02 -3.683309291418e-01 -3.263624712033e-01 -8.396039179924e-03 --1.916324382654e-01 --2.628404302745e-01 --4.853315252243e-01 -3.133577858731e-01 --2.225070735939e-01 --9.576593410171e-02 -1.447837018193e-01 -2.479471268180e-01 --2.815934342469e-02 -4.508725076092e-02 -1.597744878041e-01 -3.494916947631e-01 --1.426111236028e-02 --1.950362350157e-01 --1.520383426062e-01 --1.344609935156e-01 --2.834500136985e-01 --1.781729998743e-01 --2.521768488857e-03 -6.124647252338e-02 -1.821655951804e-01 -1.293018729851e-01 --9.152586815194e-02 -1.765147511709e-01 -1.875253937772e-02 -2.166082722554e-01 -2.938824219314e-01 -5.320082811374e-02 -2.741659946012e-02 --2.433400466181e-02 --2.085467015769e-01 --1.562518751071e-01 -1.953718281920e-01 --1.221103203238e-02 --6.595354434769e-03 -1.189039582211e-02 -4.107899017131e-02 --6.139734862958e-02 --1.123938999802e-01 -4.565610032251e-04 --1.740175952284e-01 --1.494514855103e-01 --2.351603953684e-01 -3.606743670982e-02 -8.892241319819e-02 --3.823627099458e-02 --4.888216006000e-02 --2.063411767057e-02 -2.653079299534e-02 -2.183949112550e-01 -2.504859939801e-01 -2.814937134408e-01 -7.415384984914e-01 --1.397471716093e+00 --1.489213014481e+00 --5.251418296160e-01 -1.137088253126e-02 --1.895953497433e+00 -7.674570685028e-02 --4.854041451939e+00 -3.391193043882e+00 -2.707932115838e-01 --2.105949983636e-01 -3.070531949510e+00 
--4.339743339556e+00 -1.620039300970e+00 --5.362553981240e+00 --3.777406494431e-01 -6.925621482846e-01 -2.343923900615e+00 --3.371354057465e-01 -2.055037536703e-01 diff --git a/examples/fluids/dd_sgs_data/w2.dat b/examples/fluids/dd_sgs_data/w2.dat deleted file mode 100644 index e1553a6817..0000000000 --- a/examples/fluids/dd_sgs_data/w2.dat +++ /dev/null @@ -1,121 +0,0 @@ -120 1 -1.135291623557e-01 -2.505376613198e-01 --9.772966879924e-02 --3.165730972704e-02 -2.807214492556e-01 -1.905260494013e-01 --2.411145792883e-01 -7.384048966390e-02 --1.125988973598e-01 -2.226653706004e-01 --8.789637173632e-02 -2.422358783658e-02 --1.888415645076e-01 --1.810726479901e-01 --1.820814108385e-01 -2.707856893663e-02 -2.395061686285e-01 -3.132696895911e-01 --3.571137262982e-02 --6.703403319249e-02 --2.135582591703e-02 -1.706671398779e-01 --1.422555292276e-01 --1.599414011627e-01 -5.590818266867e-02 -4.760353849516e-02 --8.990354851525e-02 --2.351533551901e-01 --9.919203877195e-02 --8.730502598066e-02 --1.624083994254e-01 --1.756234871059e-01 -2.155448112826e-03 --2.196716615285e-01 -1.230359555198e-01 --2.487008789866e-01 -1.724010972168e-01 -1.200986779247e-01 --1.356200209136e-01 --7.136175504869e-02 --3.284780361916e-02 --2.809583022011e-01 --4.970314689199e-01 --2.877535188767e-02 --3.486136238658e-01 -1.031508309715e-01 --1.166679199470e-01 -1.560071145323e-01 -2.028477831976e-01 -1.679921757572e-01 -1.107170925328e-01 -3.667441712254e-02 -4.279277543497e-02 -1.742941565737e-01 --3.784073837720e-02 -1.170800846414e-01 --8.476677440525e-02 -1.497150762135e-01 -2.095513599240e-01 -1.824870885809e-01 -4.204566627279e-03 --1.556048882917e-02 -1.383926559619e-02 --3.655393508686e-02 -1.111261215177e-01 --3.069205340750e-04 -3.488581056182e-01 --8.042626832384e-02 -1.033683988755e-01 -5.948803437376e-02 --1.994940978541e-01 -7.096924570423e-03 --5.218607313871e-01 --3.428397293084e-01 --2.293382327216e-01 --1.460950001481e-01 --1.581076721431e-01 --2.289507718293e-01 
-9.798627298221e-02 -1.437733340246e-01 -1.419228410529e-01 -1.958229699684e-01 -6.931951694653e-03 --7.136749568601e-02 --4.555582403662e-01 --3.070119242611e-01 --2.470410221827e-01 --7.803738726853e-02 -9.142063556119e-02 --1.368559538361e-02 --1.850283326418e-01 -1.152746119954e-02 -1.638429235964e-01 --1.435165512193e-01 --2.534513849487e-01 --2.984090266181e-01 -2.217432932036e-01 --8.358398540164e-02 -7.406614310444e-02 --5.651017266891e-02 --2.270784064420e-01 --2.302290117375e-01 -9.304265393625e-02 -6.798332878752e-02 -4.431976767864e-02 --1.707610729819e-01 --1.410204520039e-01 -1.327823810929e-01 --6.044012224887e-02 --1.376555083883e-02 --3.025252354651e-03 -1.907005235143e-01 -1.291788250753e-01 -5.697185825588e-02 -5.093944063855e-02 --5.412382470510e-02 -2.268724377069e-01 --7.159129384369e-02 --2.554784469980e-01 --1.335334767520e-01 diff --git a/examples/fluids/include/bc_definition.h b/examples/fluids/include/bc_definition.h new file mode 100644 index 0000000000..7b5671ab1c --- /dev/null +++ b/examples/fluids/include/bc_definition.h @@ -0,0 +1,45 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#pragma once + +#include +#include + +typedef struct _p_BCDefinition *BCDefinition; +struct _p_BCDefinition { + char *name; + + // Boundary ID information + PetscInt num_label_values, *label_values, dm_field; + + // Essential Boundary information + PetscInt num_essential_comps, *essential_comps; +}; + +/** + @brief Creates a `BCDefinition` from an array of integers in an option in the database + + Must be between `PetscOptionsBegin()` and `PetscOptionsEnd()`. 
+ + @param[in] opt The option one is seeking + @param[in] text Short string describing option + @param[in] man Manual page for the option + @param[in] name String that sets the name of the `BCDefinition` + @param[out] bc_def Resulting `BCDefinition`, `NULL` if option is not set + @param[out] set `PETSC_TRUE` if found, else `PETSC_FALSE` +**/ +#define PetscOptionsBCDefinition(opt, text, man, name, bc_def, set) \ + PetscOptionsBCDefinition_Private(PetscOptionsObject, opt, text, man, name, bc_def, set) +PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems PetscOptionsObject, const char opt[], const char text[], const char man[], + const char name[], BCDefinition *bc_def, PetscBool *set); + +PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def); +PetscErrorCode BCDefinitionGetInfo(BCDefinition bc_def, const char *name[], PetscInt *num_label_values, const PetscInt *label_values[]); +PetscErrorCode BCDefinitionDestroy(BCDefinition *bc_def); + +PetscErrorCode BCDefinitionSetEssential(BCDefinition bc_def, PetscInt num_essential_comps, PetscInt essential_comps[]); +PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essential_comps, const PetscInt *essential_comps[]); diff --git a/examples/fluids/include/log_events.h b/examples/fluids/include/log_events.h new file mode 100644 index 0000000000..4a70db3b83 --- /dev/null +++ b/examples/fluids/include/log_events.h @@ -0,0 +1,22 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#pragma once + +#include + +extern PetscLogEvent FLUIDS_CeedOperatorApply; +extern PetscLogEvent FLUIDS_CeedOperatorAssemble; +extern PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal; +extern PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal; +extern PetscLogEvent FLUIDS_SmartRedis_Init; +extern PetscLogEvent FLUIDS_SmartRedis_Meta; +extern PetscLogEvent FLUIDS_SmartRedis_Train; +extern PetscLogEvent FLUIDS_TrainDataCompute; +extern PetscLogEvent FLUIDS_DifferentialFilter; +extern PetscLogEvent FLUIDS_VelocityGradientProjection; + +PetscErrorCode RegisterLogEvents(); diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h index f5d5d9ac6a..67a77b7591 100644 --- a/examples/fluids/include/mat-ceed-impl.h +++ b/examples/fluids/include/mat-ceed-impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,48 +7,20 @@ #pragma once #include +#include #include #include - -#if defined(__clang_analyzer__) -#define MATCEED_EXTERN extern -#elif defined(__cplusplus) -#define MATCEED_EXTERN extern "C" -#else -#define MATCEED_EXTERN extern -#endif - -#if defined(__clang_analyzer__) -#define MATCEED_INTERN -#else -#define MATCEED_INTERN MATCEED_EXTERN __attribute__((visibility("hidden"))) -#endif - -/** - @brief Calls a libCEED function and then checks the resulting error code. - If the error code is non-zero, then a PETSc error is set with the libCEED error message. -**/ -#ifndef PetscCallCeed -#define PetscCallCeed(ceed_, ...) 
\ - do { \ - int ierr_q_ = __VA_ARGS__; \ - if (ierr_q_ != CEED_ERROR_SUCCESS) { \ - const char *error_message; \ - CeedGetErrorMessage(ceed_, &error_message); \ - SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \ - } \ - } while (0) -#endif +#include // MatCeed context for applying composite CeedOperator on a DM typedef struct MatCeedContext_private *MatCeedContext; struct MatCeedContext_private { Ceed ceed; - char *name, *internal_mat_type; + char *name, *coo_mat_type; PetscMemType mem_type; PetscInt ref_count, num_mats_assembled_full, num_mats_assembled_pbd; PetscBool is_destroyed, is_ceed_pbd_valid, is_ceed_vpbd_valid; - PetscLogEvent log_event_mult, log_event_mult_transpose; + PetscLogEvent log_event_mult, log_event_mult_transpose, log_event_ceed_mult, log_event_ceed_mult_transpose; DM dm_x, dm_y; Mat *mats_assembled_full, *mats_assembled_pbd, mat_assembled_full_internal, mat_assembled_pbd_internal; Vec X_loc, Y_loc_transpose; @@ -58,17 +30,18 @@ struct MatCeedContext_private { }; // Context data -MATCEED_INTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult, - CeedOperator op_mult_transpose, PetscLogEvent log_event_mult, - PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx); -MATCEED_INTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx); -MATCEED_INTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy); -MATCEED_INTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx); - -// Mat Ceed -MATCEED_INTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D); -MATCEED_INTERN PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y); -MATCEED_INTERN PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X); +PETSC_CEED_EXTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult, + CeedOperator op_mult_transpose, PetscLogEvent log_event_mult, + PetscLogEvent log_event_mult_transpose, 
PetscLogEvent log_event_ceed_mult, + PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx); +PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx); +PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy); +PETSC_CEED_EXTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext *ctx); + +// MatCEED +PETSC_CEED_EXTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D); +PETSC_CEED_EXTERN PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y); +PETSC_CEED_EXTERN PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X); extern PetscClassId MATCEED_CLASSID; extern PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE; diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h index 75a7a612dd..b6a8c08511 100644 --- a/examples/fluids/include/mat-ceed.h +++ b/examples/fluids/include/mat-ceed.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,38 +7,45 @@ #pragma once #include +#include #include #include #define MATCEED "ceed" -#if defined(__clang_analyzer__) -#define MATCEED_EXTERN extern -#elif defined(__cplusplus) -#define MATCEED_EXTERN extern "C" -#else -#define MATCEED_EXTERN extern -#endif - -#if defined(__clang_analyzer__) -#define MATCEED_INTERN -#else -#define MATCEED_INTERN MATCEED_EXTERN __attribute__((visibility("hidden"))) -#endif - -// Context data -MATCEED_INTERN PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat); -MATCEED_INTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other); -MATCEED_INTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo); -MATCEED_INTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx); -MATCEED_INTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx); -MATCEED_INTERN PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type); -MATCEED_INTERN PetscErrorCode MatCeedGetInnerMatType(Mat mat, MatType *type); -MATCEED_INTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)); -MATCEED_INTERN PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose); -MATCEED_INTERN PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose); -MATCEED_INTERN PetscErrorCode MatCeedRestoreLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose); -MATCEED_INTERN PetscErrorCode MatCeedGetCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose); -MATCEED_INTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose); -MATCEED_INTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose); -MATCEED_INTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose); +// Core 
functionality +PETSC_CEED_EXTERN PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat); +PETSC_CEED_EXTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other); +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetAssemblyDataUpdateNeeded(Mat mat_ceed, PetscBool update_needed); +PETSC_CEED_EXTERN PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo); +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo); +PETSC_CEED_EXTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo); + +PETSC_CEED_INTERN PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value); +PETSC_CEED_INTERN PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value); +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value); +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value); +PETSC_CEED_INTERN PetscErrorCode MatCeedSetTime(Mat mat, PetscReal time); +PETSC_CEED_INTERN PetscErrorCode MatCeedGetTime(Mat mat, PetscReal *time); +PETSC_CEED_INTERN PetscErrorCode MatCeedSetDt(Mat mat, PetscReal dt); +PETSC_CEED_INTERN PetscErrorCode MatCeedSetShifts(Mat mat, PetscReal shift_v, PetscReal shift_a); + +// Advanced functionality +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscCtxDestroyFn f, void *ctx); +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx); + +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)); +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type); +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCOOMatType(Mat mat, MatType *type); + +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose); +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose); +PETSC_CEED_EXTERN 
PetscErrorCode MatCeedRestoreLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose); + +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose); +PETSC_CEED_EXTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose); + +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose); +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose); +PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCeedOperatorLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose); +PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCeedOperatorLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose); diff --git a/examples/fluids/include/ceed-utils.h b/examples/fluids/include/petsc-ceed-utils.h similarity index 84% rename from examples/fluids/include/ceed-utils.h rename to examples/fluids/include/petsc-ceed-utils.h index b7962b0f5c..54b61610cb 100644 --- a/examples/fluids/include/ceed-utils.h +++ b/examples/fluids/include/petsc-ceed-utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -9,15 +9,43 @@ #include #include -#define PetscCallCeed(ceed, ...) \ - do { \ - int ierr = __VA_ARGS__; \ - if (ierr != CEED_ERROR_SUCCESS) { \ - const char *error_message; \ - CeedGetErrorMessage(ceed, &error_message); \ - SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \ - } \ - } while (0) +/** + @brief Copy the reference to a `Vec`. 
+ Note: If `vec_copy` is non-null, it is assumed to be a valid pointer to a `Vec` and `VecDestroy()` will be called. + + Collective across MPI processes. + + @param[in] vec `Vec` to reference + @param[out] vec_copy Copy of reference + + @return An error code: 0 - success, otherwise - failure +**/ +static inline PetscErrorCode VecReferenceCopy(Vec vec, Vec *vec_copy) { + PetscFunctionBeginUser; + PetscCall(PetscObjectReference((PetscObject)vec)); + PetscCall(VecDestroy(vec_copy)); + *vec_copy = vec; + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Copy the reference to a `DM`. + Note: If `dm_copy` is non-null, it is assumed to be a valid pointer to a `DM` and `DMDestroy()` will be called. + + Collective across MPI processes. + + @param[in] dm `DM` to reference + @param[out] dm_copy Copy of reference + + @return An error code: 0 - success, otherwise - failure +**/ +static inline PetscErrorCode DMReferenceCopy(DM dm, DM *dm_copy) { + PetscFunctionBeginUser; + PetscCall(PetscObjectReference((PetscObject)dm)); + PetscCall(DMDestroy(dm_copy)); + *dm_copy = dm; + PetscFunctionReturn(PETSC_SUCCESS); +} /** @brief Translate PetscMemType to CeedMemType @@ -192,6 +220,7 @@ static inline PetscErrorCode VecCopyPetscToCeed(Vec X_petsc, CeedVector x_ceed) PetscCall(VecGetArrayReadAndMemType(X_petsc, (const PetscScalar **)&x, &mem_type)); PetscCallCeed(ceed, CeedVectorSetArray(x_ceed, MemTypePetscToCeed(mem_type), CEED_COPY_VALUES, x)); PetscCall(VecRestoreArrayReadAndMemType(X_petsc, (const PetscScalar **)&x)); + PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, PetscObjectComm((PetscObject)X_petsc), PETSC_ERR_LIB, "Destroying Ceed object failed"); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/include/petsc-ceed.h b/examples/fluids/include/petsc-ceed.h new file mode 100644 index 0000000000..3b3d648d15 --- /dev/null +++ b/examples/fluids/include/petsc-ceed.h @@ -0,0 +1,42 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and 
other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#pragma once + +#include + +#if defined(__clang_analyzer__) +#define PETSC_CEED_EXTERN extern +#elif defined(__cplusplus) +#define PETSC_CEED_EXTERN extern "C" +#else +#define PETSC_CEED_EXTERN extern +#endif + +#if defined(__clang_analyzer__) +#define PETSC_CEED_INTERN +#else +#define PETSC_CEED_INTERN PETSC_CEED_EXTERN __attribute__((visibility("hidden"))) +#endif + +/** + @brief Calls a libCEED function and then checks the resulting error code. + If the error code is non-zero, then a PETSc error is set with the libCEED error message. +**/ +/// @ingroup RatelInternal +#ifndef PetscCallCeed +#define PetscCallCeed(ceed_, ...) \ + do { \ + int ierr_q_; \ + PetscStackUpdateLine; \ + ierr_q_ = __VA_ARGS__; \ + if (PetscUnlikely(ierr_q_ != CEED_ERROR_SUCCESS)) { \ + const char *error_message; \ + CeedGetErrorMessage(ceed_, &error_message); \ + SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \ + } \ + } while (0) +#endif diff --git a/examples/fluids/include/petsc_ops.h b/examples/fluids/include/petsc_ops.h index 9913780172..d614df60ab 100644 --- a/examples/fluids/include/petsc_ops.h +++ b/examples/fluids/include/petsc_ops.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/include/smartsim.h b/examples/fluids/include/smartsim.h deleted file mode 100644 index f8ba943e5f..0000000000 --- a/examples/fluids/include/smartsim.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed -#pragma once - -#include -#include -#include - -#if defined(__clang_analyzer__) -void PetscCallSmartRedis(SRError); -#else -#define PetscCallSmartRedis(...) \ - do { \ - SRError ierr_smartredis_call_q_; \ - PetscBool disable_calls = PETSC_FALSE; \ - PetscStackUpdateLine; \ - PetscCall(PetscOptionsGetBool(NULL, NULL, "-smartsim_disable_calls", &disable_calls, NULL)); \ - if (disable_calls == PETSC_TRUE) break; \ - ierr_smartredis_call_q_ = __VA_ARGS__; \ - if (PetscUnlikely(ierr_smartredis_call_q_ != SRNoError)) \ - SETERRQ(PETSC_COMM_SELF, (PetscErrorCode)ierr_smartredis_call_q_, "SmartRedis Error (Code %d): %s", ierr_smartredis_call_q_, \ - SRGetLastError()); \ - } while (0) -#endif - -PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length); diff --git a/examples/fluids/index.md b/examples/fluids/index.md index 9c53bef0f3..27fe5b9a3c 100644 --- a/examples/fluids/index.md +++ b/examples/fluids/index.md @@ -9,7 +9,7 @@ Moreover, the Navier-Stokes example has been developed using PETSc, so that the ## Running the mini-app ```{include} README.md -:start-after: inclusion-fluids-marker +:start-after: ``` ## The Navier-Stokes equations @@ -317,65 +317,111 @@ Note that this wave speed is specific to ideal gases as $\gamma$ is an ideal gas Currently, this demo provides three types of problems/physical models that can be selected at run time via the option `-problem`. 
{ref}`problem-advection`, the problem of the transport of energy in a uniform vector velocity field, {ref}`problem-euler-vortex`, the exact solution to the Euler equations, and the so called {ref}`problem-density-current` problem. -### Subgrid Stress Modeling +### Statistics Collection +For scale-resolving simulations (such as LES and DNS), statistics for a simulation are more often useful than time-instantaneous snapshots of the simulation itself. +To make this process more computationally efficient, averaging in the spanwise direction, if physically correct, can help reduce the amount of simulation time needed to get converged statistics. -When a fluid simulation is under-resolved (the smallest length scale resolved by the grid is much larger than the smallest physical scale, the [Kolmogorov length scale](https://en.wikipedia.org/wiki/Kolmogorov_microscales)), this is mathematically interpreted as filtering the Navier-Stokes equations. -This is known as large-eddy simulation (LES), as only the "large" scales of turbulence are resolved. -This filtering operation results in an extra stress-like term, $\bm{\tau}^r$, representing the effect of unresolved (or "subgrid" scale) structures in the flow. -Denoting the filtering operation by $\overline \cdot$, the LES governing equations are: +First, let's more precisely define what we mean by spanwise average. +Denote $\langle \phi \rangle$ as the Reynolds average of $\phi$, which in this case would be an average over the spanwise direction and time: $$ -\frac{\partial \bm{\overline q}}{\partial t} + \nabla \cdot \bm{\overline F}(\bm{\overline q}) -S(\bm{\overline q}) = 0 \, , -$$ (eq-vector-les) +\langle \phi \rangle(x,y) = \frac{1}{L_z (T_f - T_0)}\int_0^{L_z} \int_{T_0}^{T_f} \phi(x, y, z, t) \mathrm{d}t \mathrm{d}z +$$ -where +where $z$ is the spanwise direction, the domain has size $[0, L_z]$ in the spanwise direction, and $[T_0, T_f]$ is the range of time being averaged over. 
+Note that here and in the code, **we assume the spanwise direction to be in the $z$ direction**. + +To discuss the details of the implementation we'll first discuss the spanwise integral, then the temporal integral, and lastly the statistics themselves. + +#### Spanwise Integral +The function $\langle \phi \rangle (x,y)$ is represented on a 2-D finite element grid, taken from the full domain mesh itself. +If isoperiodicity is set, the periodic face is extracted as the spanwise statistics mesh. +Otherwise the negative z face is used. +We'll refer to this mesh as the *parent grid*, as for every "parent" point in the parent grid, there are many "child" points in the full domain. +Define a function space on the parent grid as $\mathcal{V}_p^\mathrm{parent} = \{ \bm v(\bm x) \in H^{1}(\Omega_e^\mathrm{parent}) \,|\, \bm v(\bm x_e(\bm X)) \in P_p(\bm{I}), e=1,\ldots,N_e \}$. +We enforce that the order of the parent FEM space is equal to the full domain's order. + +Many statistics are the product of 2 or more solution functions, which results in functions of degree higher than the parent FEM space, $\mathcal{V}_p^\mathrm{parent}$. +To represent these higher-order functions on the parent FEM space, we perform an $L^2$ projection. +Define the spanwise averaged function as: $$ -\bm{\overline F}(\bm{\overline q}) = -\bm{F} (\bm{\overline q}) + -\begin{pmatrix} - 0\\ - \bm{\tau}^r \\ - \bm{u} \cdot \bm{\tau}^r -\end{pmatrix} -$$ (eq-les-flux) - -More details on deriving the above expression, filtering, and large eddy simulation can be found in {cite}`popeTurbulentFlows2000`. -To close the problem, the subgrid stress must be defined. -For implicit LES, the subgrid stress is set to zero and the numerical properties of the discretized system are assumed to account for the effect of subgrid scale structures on the filtered solution field. -For explicit LES, it is defined by a subgrid stress model. 
- -(sgs-dd-model)= -#### Data-driven SGS Model - -The data-driven SGS model implemented here uses a small neural network to compute the SGS term. -The SGS tensor is calculated at nodes using an $L^2$ projection of the velocity gradient and grid anisotropy tensor, and then interpolated onto quadrature points. -More details regarding the theoretical background of the model can be found in {cite}`prakashDDSGS2022` and {cite}`prakashDDSGSAnisotropic2022`. - -The neural network itself consists of 1 hidden layer and 20 neurons, using Leaky ReLU as its activation function. -The slope parameter for the Leaky ReLU function is set via `-sgs_model_dd_leakyrelu_alpha`. -The outputs of the network are assumed to be normalized on a min-max scale, so they must be rescaled by the original min-max bounds. -Parameters for the neural network are put into files in a directory found in `-sgs_model_dd_parameter_dir`. -These files store the network weights (`w1.dat` and `w2.dat`), biases (`b1.dat` and `b2.dat`), and scaling parameters (`OutScaling.dat`). -The first row of each files stores the number of columns and rows in each file. -Note that the weight coefficients are assumed to be in column-major order. -This is done to keep consistent with legacy file compatibility. +\langle \phi \rangle_z(x,y,t) = \frac{1}{L_z} \int_0^{L_z} \phi(x, y, z, t) \mathrm{d}z +$$ -:::{note} -The current data-driven model parameters are not accurate and are for regression testing only. -::: +where the function $\phi$ may be the product of multiple solution functions and $\langle \phi \rangle_z$ denotes the spanwise average. +The projection of a function $u$ onto the parent FEM space would look like: + +$$ +\bm M u_N = \int_0^{L_x} \int_0^{L_y} u \psi^\mathrm{parent}_N \mathrm{d}y \mathrm{d}x +$$ +where $\bm M$ is the mass matrix for $\mathcal{V}_p^\mathrm{parent}$, $u_N$ the coefficients of the projected function, and $\psi^\mathrm{parent}_N$ the basis functions of the parent FEM space. 
+Substituting the spanwise average of $\phi$ for $u$, we get: + +$$ +\bm M [\langle \phi \rangle_z]_N = \int_0^{L_x} \int_0^{L_y} \left [\frac{1}{L_z} \int_0^{L_z} \phi(x,y,z,t) \mathrm{d}z \right ] \psi^\mathrm{parent}_N(x,y) \mathrm{d}y \mathrm{d}x +$$ -##### Data-driven Model Using External Libraries +The triple integral in the right hand side is just an integral over the full domain -There are two different modes for using the data-driven model: fused and sequential. +$$ +\bm M [\langle \phi \rangle_z]_N = \frac{1}{L_z} \int_\Omega \phi(x,y,z,t) \psi^\mathrm{parent}_N(x,y) \mathrm{d}\Omega +$$ + +We need to evaluate $\psi^\mathrm{parent}_N$ at quadrature points in the full domain. +To do this efficiently, **we assume and exploit the full domain grid to be a tensor product in the spanwise direction**. +This assumption means quadrature points in the full domain have the same $(x,y)$ coordinate location as quadrature points in the parent domain. +This also allows the use of the full domain quadrature weights for the triple integral. -In fused mode, the input processing, model inference, and output handling were all done in a single CeedOperator. -Conversely, sequential mode has separate function calls/CeedOperators for input creation, model inference, and output handling. -By separating the three steps to the model evaluation, the sequential mode allows for functions calling external libraries to be used for the model inference step. -This however is slower than the fused kernel, but this requires a native libCEED inference implementation. +#### Temporal Integral/Averaging +To calculate the temporal integral, we do a running average using left-rectangle rule. +At the beginning of each simulation, the time integral of a statistic is set to 0, $\overline{\phi} = 0$. +Periodically, the integral is updated using left-rectangle rule: -To use the fused mode, set `-sgs_model_dd_use_fused true`. -To use the sequential mode, set the same flag to `false`. 
+$$\overline{\phi}_\mathrm{new} = \overline{\phi}_{\mathrm{old}} + \phi(t_\mathrm{new}) \Delta T$$ +where $\phi(t_\mathrm{new})$ is the statistic at the current time and $\Delta T$ is the time since the last update. +When stats are written out to file, this running sum is then divided by $T_f - T_0$ to get the time average. + +With this method of calculating the running time average, we can plug this into the $L^2$ projection of the spanwise integral: + +$$ +\bm M [\langle \phi \rangle]_N = \frac{1}{L_z (T_f - T_0)} \int_\Omega \int_{T_0}^{T_f} \phi(x,y,z,t) \psi^\mathrm{parent}_N \mathrm{d}t \mathrm{d}\Omega +$$ +where the integral $\int_{T_0}^{T_f} \phi(x,y,z,t) \mathrm{d}t$ is calculated on a running basis. + + +#### Running +As the simulation runs, it takes a running time average of the statistics at the full domain quadrature points. +This running average is only updated at the interval specified by `-ts_monitor_turbulence_spanstats_collect_interval` as a number of timesteps. +The $L^2$ projection problem is only solved when statistics are written to file, which is controlled by `-ts_monitor_turbulence_spanstats_viewer_interval`. +Note that the averaging is not reset after each file write. +The average is always over the bounds $[T_0, T_f]$, where $T_f$ in this case would be the time the file was written at and $T_0$ is the solution time at the beginning of the run. + +#### Turbulent Statistics + +The focus here is on those statistics that are relevant to turbulent flow. +The terms collected are listed below, with the mathematical definition on the left and the label (present in CGNS output files) on the right. 
+ +| Math | Label | +| ----------------- | -------- | +| $\langle \rho \rangle$ | MeanDensity | +| $\langle p \rangle$ | MeanPressure | +| $\langle p^2 \rangle$ | MeanPressureSquared | +| $\langle p u_i \rangle$ | MeanPressureVelocity[$i$] | +| $\langle \rho T \rangle$ | MeanDensityTemperature | +| $\langle \rho T u_i \rangle$ | MeanDensityTemperatureFlux[$i$] | +| $\langle \rho u_i \rangle$ | MeanMomentum[$i$] | +| $\langle \rho u_i u_j \rangle$ | MeanMomentumFlux[$ij$] | +| $\langle u_i \rangle$ | MeanVelocity[$i$] | + +where [$i$] are suffixes to the labels. So $\langle \rho u_x u_y \rangle$ would correspond to MeanMomentumFluxXY. +This naming convention attempts to mimic the CGNS standard. + +To get second-order statistics from these terms, simply use the identity: + +$$ +\langle \phi' \theta' \rangle = \langle \phi \theta \rangle - \langle \phi \rangle \langle \theta \rangle +$$ (differential-filtering)= ### Differential Filtering @@ -470,42 +516,6 @@ To match the "size" of a normal kernel to our differential kernel, we attempt to To match the box and Gaussian filters "sizes", we use $\beta = 1/10$ and $\beta = 1/6$, respectively. $\beta$ can be set via `-diff_filter_kernel_scaling`. -### *In Situ* Machine-Learning Model Training -Training machine-learning models normally uses *a priori* (already gathered) data stored on disk. -This is computationally inefficient, particularly as the scale of the problem grows and the data that is saved to disk reduces to a small percentage of the total data generated by a simulation. -One way of working around this to to train a model on data coming from an ongoing simulation, known as *in situ* (in place) learning. - -This is implemented in the code using [SmartSim](https://www.craylabs.org/docs/overview.html). -Briefly, the fluid simulation will periodically place data for training purposes into a database that a separate process uses to train a model. 
-The database used by SmartSim is [Redis](https://redis.com/modules/redis-ai/) and the library to connect to the database is called [SmartRedis](https://www.craylabs.org/docs/smartredis.html). -More information about how to utilize this code in a SmartSim configuration can be found on [SmartSim's website](https://www.craylabs.org/docs/overview.html). - -To use this code in a SmartSim *in situ* setup, first the code must be built with SmartRedis enabled. -This is done by specifying the installation directory of SmartRedis using the `SMARTREDIS_DIR` environment variable when building: - -``` -make SMARTREDIS_DIR=~/software/smartredis/install -``` - -#### SGS Data-Driven Model *In Situ* Training -Currently the code is only setup to do *in situ* training for the SGS data-driven model. -Training data is split into the model inputs and outputs. -The model inputs are calculated as the same model inputs in the SGS Data-Driven model described {ref}`earlier`. -The model outputs (or targets in the case of training) are the subgrid stresses. -Both the inputs and outputs are computed from a filtered velocity field, which is calculated via {ref}`differential-filtering`. -The settings for the differential filtering used during training are described in {ref}`differential-filtering`. -The training will create multiple sets of data per each filter width defined in `-sgs_train_filter_widths`. -Those scalar filter widths correspond to the scaling correspond to $\bm{D} = c \bm{I}$, where $c$ is the scalar filter width. - -The SGS *in situ* training can be enabled using the `-sgs_train_enable` flag. -Data can be processed and placed into the database periodically. -The interval between is controlled by `-sgs_train_write_data_interval`. -There's also the choice of whether to add new training data on each database write or to overwrite the old data with new data. -This is controlled by `-sgs_train_overwrite_data`. 
- -The database may also be located on the same node as a MPI rank (collocated) or located on a separate node (distributed). -It's necessary to know how many ranks are associated with each collocated database, which is set by `-smartsim_collocated_database_num_ranks`. - (problem-advection)= ## Advection-Diffusion @@ -855,20 +865,17 @@ numerous terms in the STG formulation. #### Internal Damping Layer (IDL) The STG inflow boundary condition creates large amplitude acoustic waves. -We use an internal damping layer (IDL) to damp them out without disrupting the synthetic structures developing into natural turbulent structures. This implementation was inspired from -{cite}`shurSTG2014`, but is implemented here as a ramped volumetric forcing -term, similar to a sponge layer (see 8.4.2.4 in {cite}`colonius2023turbBC` for example). It takes the following form: +We use an internal damping layer (IDL) to damp them out without disrupting the synthetic structures developing into natural turbulent structures. +This implementation was inspired by {cite}`shurSTG2014`, but is implemented here as a ramped volumetric forcing term, similar to a sponge layer (see 8.4.2.4 in {cite}`colonius2023turbBC` for example). +It takes the following form: $$ S(\bm{q}) = -\sigma(\bm{x})\left.\frac{\partial \bm{q}}{\partial \bm{Y}}\right\rvert_{\bm{q}} \bm{Y}' $$ -where $\bm{Y}' = [P - P_\mathrm{ref}, \bm{0}, 0]^T$, and $\sigma(\bm{x})$ is a -linear ramp starting at `-idl_start` with length `-idl_length` and an amplitude -of inverse `-idl_decay_rate`. The damping is defined in terms of a pressure-primitive -anomaly $\bm Y'$ converted to conservative source using $\partial -\bm{q}/\partial \bm{Y}\rvert_{\bm{q}}$, which is linearized about the current -flow state. $P_\mathrm{ref}$ is defined via the `-reference_pressure` flag. 
+where $\bm{Y}' = [P - P_\mathrm{ref}, \bm{0}, 0]^T$, and $\sigma(\bm{x})$ is a linear ramp starting at `-idl_start` with length `-idl_length` and an amplitude of inverse `-idl_decay_time`. +The damping is defined in terms of a pressure-primitive anomaly $\bm Y'$ converted to conservative source using $\partial \bm{q}/\partial \bm{Y}\rvert_{\bm{q}}$, which is linearized about the current flow state. +$P_\mathrm{ref}$ has a default value equal to the `-reference_pressure` flag, with an optional flag `-idl_pressure` to set it to a different value. ### Meshing diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c index 5741119dde..0b674bd660 100644 --- a/examples/fluids/navierstokes.c +++ b/examples/fluids/navierstokes.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -18,18 +18,18 @@ // ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml // ./navierstokes -ceed /gpu/cuda -problem advection -degree 1 // -//TESTARGS(name="Gaussian Wave, explicit, supg") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal +//TESTARGS(name="Newtonian and Riemann Solver Unit Tests",only="cpu") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e100 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 0 -newtonian_unit_tests -riemann_solver_unit_tests +//TESTARGS(name="Gaussian Wave, IDL and Entropy variables") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -state_var entropy -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70 +//TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70 //TESTARGS(name="Advection 2D, rotation, 
explicit, supg, consistent mass") -ceed {ceed_resource} -test_type solver -problem advection -degree 3 -dm_plex_box_faces 2,2 -dm_plex_box_lower 0,0 -dm_plex_box_upper 125,125 -bc_wall 1,2,3,4 -wall_comps 4 -units_kilogram 1e-9 -rc 100. -ts_dt 1e-3 -ts_max_steps 10 -stab supg -Ctaus 0.5 -mass_ksp_type gmres -mass_pc_type vpbjacobi -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv2d-rotation-explicit-stab-supg-consistent-mass.bin -//TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin +//TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 7e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin //TESTARGS(name="Blasius, bc_slip") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/blasius.yaml -ts_max_steps 5 -dm_plex_box_faces 3,20,1 -platemesh_nDelta 10 -platemesh_growth 1.2 -bc_outflow 5 -bc_slip 4 -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-bc_slip.bin -//TESTARGS(name="Blasius, SGS DataDriven Sequential") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 
-sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_use_fused false //TESTARGS(name="Advection, rotation, cosine") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 0 -advection_ic_type cosine_hill -dm_plex_box_faces 2,1,1 -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-rotation-cosine.bin //TESTARGS(name="Gaussian Wave, using MatShell") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -amat_type shell -pc_type vpbjacobi -ts_alpha_radius 0.5 //TESTARGS(name="Taylor-Green Vortex IC") -ceed {ceed_resource} -problem taylor_green -test_type solver -dm_plex_dim 3 -dm_plex_box_faces 6,6,6 -ts_max_steps 0 -compare_final_state_atol 1e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-taylor-green-IC.bin -//TESTARGS(name="Blasius, SGS DataDriven Fused") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin //TESTARGS(name="Blasius, Anisotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file 
examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_aniso_vandriest.bin -diff_filter_monitor -ts_max_steps 0 -state_var primitive -diff_filter_friction_length 1e-5 -diff_filter_wall_damping_function van_driest -diff_filter_ksp_rtol 1e-8 -diff_filter_grid_based_width -diff_filter_width_scaling 1,0.7,1 //TESTARGS(name="Blasius, Isotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_iso.bin -diff_filter_monitor -ts_max_steps 0 -diff_filter_width_scaling 4.2e-5,4.2e-5,4.2e-5 -diff_filter_ksp_atol 1e-14 -diff_filter_ksp_rtol 1e-16 -//TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5 +//TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5 -idl_pressure 70 //TESTARGS(name="Spanwise Turbulence Statistics") -ceed {ceed_resource} -test_type turb_spanstats -options_file examples/fluids/tests-output/stats_test.yaml -compare_final_state_atol 1E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-turb-spanstats-stats.bin //TESTARGS(name="Blasius") -ceed {ceed_resource} -test_type 
solver -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius.bin //TESTARGS(name="Blasius, STG Inflow") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/tests-output/blasius_stgtest.yaml -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_STG.bin @@ -89,10 +89,10 @@ int main(int argc, char **argv) { Units units; PetscCall(PetscCalloc1(1, &units)); - user->app_ctx = app_ctx; - user->units = units; - user->phys = phys_ctx; - problem->bc_from_ics = PETSC_TRUE; + user->app_ctx = app_ctx; + user->units = units; + user->phys = phys_ctx; + problem->set_bc_from_ics = PETSC_TRUE; PetscCall(RegisterLogEvents()); @@ -106,6 +106,7 @@ int main(int argc, char **argv) { MPI_Comm comm = PETSC_COMM_WORLD; user->comm = comm; PetscCall(ProcessCommandLineOptions(comm, app_ctx, bc)); + PetscCall(BoundaryConditionSetUp(user, problem, app_ctx, bc)); // --------------------------------------------------------------------------- // Initialize libCEED @@ -208,7 +209,7 @@ int main(int argc, char **argv) { // We use this for the main simulation DM because the reference DMPlexInsertBoundaryValues() is very slow on the GPU due to extra device-to-host // communication. If we disable this, we should still get the same results due to the problem->bc function, but with potentially much slower // execution. 
- if (problem->bc_from_ics) { + if (problem->set_bc_from_ics) { PetscCall(SetBCsFromICs(dm, Q, user->Q_loc)); } @@ -229,9 +230,6 @@ int main(int argc, char **argv) { PetscCall(SetupICsFromBinary(comm, app_ctx, Q)); } - // Print problem summary - if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, phys_ctx, problem, comm)); - // -- Zero Q_loc PetscCall(VecZeroEntries(user->Q_loc)); @@ -240,7 +238,7 @@ int main(int argc, char **argv) { // --------------------------------------------------------------------------- TS ts; PetscScalar final_time; - PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, &Q, &final_time, &ts)); + PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, problem, &Q, &final_time, &ts)); // --------------------------------------------------------------------------- // Post-processing @@ -253,10 +251,7 @@ int main(int argc, char **argv) { PetscCall(TurbulenceStatisticsDestroy(user, ceed_data)); PetscCall(NodalProjectionDataDestroy(user->grad_velo_proj)); - PetscCall(SgsDDDataDestroy(user->sgs_dd_data)); PetscCall(DifferentialFilterDataDestroy(user->diff_filter)); - PetscCall(SGS_DD_TrainingDataDestroy(user->sgs_dd_train)); - PetscCall(SmartSimDataDestroy(user->smartsim)); // -- Vectors PetscCallCeed(ceed, CeedVectorDestroy(&ceed_data->x_coord)); @@ -268,9 +263,6 @@ int main(int argc, char **argv) { // -- Bases PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_q)); PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_x)); - PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_xc)); - PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_q_sur)); - PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_x_sur)); // -- Restrictions PetscCallCeed(ceed, CeedElemRestrictionDestroy(&ceed_data->elem_restr_q)); @@ -288,30 +280,14 @@ int main(int argc, char **argv) { PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_freestream_jacobian.qfunction_context)); PetscCallCeed(ceed, 
CeedQFunctionContextDestroy(&problem->apply_slip.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_slip_jacobian.qfunction_context)); - PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->setup_sur.qfunction_context)); - PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->setup_vol.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->ics.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_rhs.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_ifunction.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_ijacobian.qfunction_context)); } - // -- QFunctions - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_vol)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_ics)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_rhs_vol)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_ifunction_vol)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_sur)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow_jacobian)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream_jacobian)); - // -- Operators - PetscCallCeed(ceed, CeedOperatorDestroy(&ceed_data->op_setup_vol)); PetscCall(OperatorApplyContextDestroy(ceed_data->op_ics_ctx)); - PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_rhs_vol)); - PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction_vol)); PetscCall(OperatorApplyContextDestroy(user->op_rhs_ctx)); PetscCall(OperatorApplyContextDestroy(user->op_strong_bc_ctx)); PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction)); @@ -353,8 +329,13 @@ int main(int argc, char **argv) { PetscCall(PetscFree(app_ctx->amat_type)); 
PetscCall(PetscFree(app_ctx->wall_forces.walls)); PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer)); + PetscCall(PetscViewerDestroy(&app_ctx->turb_spanstats_viewer)); // -- Structs + for (PetscInt i = 0; i < problem->num_bc_defs; i++) { + PetscCall(BCDefinitionDestroy(&problem->bc_defs[i])); + } + PetscCall(PetscFree(problem->bc_defs)); PetscCall(PetscFree(units)); PetscCall(PetscFree(user)); PetscCall(PetscFree(problem)); diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h index 49795d2b5f..26ba140814 100644 --- a/examples/fluids/navierstokes.h +++ b/examples/fluids/navierstokes.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -6,17 +6,19 @@ // This file is part of CEED: http://github.com/ceed #pragma once -#include #include +#include +#include #include +#include #include #include #include "./include/petsc_ops.h" #include "qfunctions/newtonian_types.h" -#if PETSC_VERSION_LT(3, 21, 0) -#error "PETSc v3.21 or later is required" +#if PETSC_VERSION_LT(3, 23, 0) +#error "PETSc v3.23 or later is required" #endif // ----------------------------------------------------------------------------- @@ -32,23 +34,22 @@ typedef enum { EULER_TEST_4 = 4, EULER_TEST_5 = 5, } EulerTestType; -static const char *const EulerTestTypes[] = {"isentropic_vortex", "test_1", "test_2", "test_3", "test_4", "test_5", - "EulerTestType", "EULER_TEST_", NULL}; +static const char *const EulerTestTypes[] = {"ISENTROPIC_VORTEX", "1", "2", "3", "4", "5", "EulerTestType", "EULER_TEST_", NULL}; // Advection - Wind types -static const char *const WindTypes[] = {"rotation", "translation", "WindType", "WIND_", NULL}; +static const char *const WindTypes[] = {"ROTATION", 
"TRANSLATION", "WindType", "WIND_", NULL}; // Advection - Initial Condition Types -static const char *const AdvectionICTypes[] = {"sphere", "cylinder", "cosine_hill", "skew", "AdvectionICType", "ADVECTIONIC_", NULL}; +static const char *const AdvectionICTypes[] = {"SPHERE", "CYLINDER", "COSINE_HILL", "SKEW", "AdvectionICType", "ADVECTIONIC_", NULL}; // Advection - Bubble Continuity Types -static const char *const BubbleContinuityTypes[] = {"smooth", "back_sharp", "thick", "cosine", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL}; +static const char *const BubbleContinuityTypes[] = {"SMOOTH", "BACK_SHARP", "THICK", "COSINE", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL}; // Stabilization methods -static const char *const StabilizationTypes[] = {"none", "SU", "SUPG", "StabilizationType", "STAB_", NULL}; +static const char *const StabilizationTypes[] = {"NONE", "SU", "SUPG", "StabilizationType", "STAB_", NULL}; // Stabilization tau constants -static const char *const StabilizationTauTypes[] = {"Ctau", "AdvDiff_Shakib", "AdvDiff_Shakib_P", "StabilizationTauType", "STAB_TAU_", NULL}; +static const char *const StabilizationTauTypes[] = {"CTAU", "ADVDIFF_SHAKIB", "ADVDIFF_SHAKIB_P", "StabilizationTauType", "STAB_TAU_", NULL}; // Test mode type typedef enum { @@ -57,50 +58,29 @@ typedef enum { TESTTYPE_TURB_SPANSTATS = 2, TESTTYPE_DIFF_FILTER = 3, } TestType; -static const char *const TestTypes[] = {"none", "solver", "turb_spanstats", "diff_filter", "TestType", "TESTTYPE_", NULL}; - -// Subgrid-Stress mode type -typedef enum { - SGS_MODEL_NONE = 0, - SGS_MODEL_DATA_DRIVEN = 1, -} SGSModelType; -static const char *const SGSModelTypes[] = {"none", "data_driven", "SGSModelType", "SGS_MODEL_", NULL}; +static const char *const TestTypes[] = {"NONE", "SOLVER", "TURB_SPANSTATS", "DIFF_FILTER", "TestType", "TESTTYPE_", NULL}; // Mesh transformation type typedef enum { MESH_TRANSFORM_NONE = 0, MESH_TRANSFORM_PLATEMESH = 1, } MeshTransformType; -static const char *const 
MeshTransformTypes[] = {"none", "platemesh", "MeshTransformType", "MESH_TRANSFORM_", NULL}; +static const char *const MeshTransformTypes[] = {"NONE", "PLATEMESH", "MeshTransformType", "MESH_TRANSFORM_", NULL}; static const char *const DifferentialFilterDampingFunctions[] = { - "none", "van_driest", "mms", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL}; - -// ----------------------------------------------------------------------------- -// Log Events -// ----------------------------------------------------------------------------- -extern PetscLogEvent FLUIDS_CeedOperatorApply; -extern PetscLogEvent FLUIDS_CeedOperatorAssemble; -extern PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal; -extern PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal; -extern PetscLogEvent FLUIDS_SmartRedis_Init; -extern PetscLogEvent FLUIDS_SmartRedis_Meta; -extern PetscLogEvent FLUIDS_SmartRedis_Train; -extern PetscLogEvent FLUIDS_TrainDataCompute; -extern PetscLogEvent FLUIDS_DifferentialFilter; -extern PetscLogEvent FLUIDS_VelocityGradientProjection; -PetscErrorCode RegisterLogEvents(); + "NONE", "VAN_DRIEST", "MMS", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL}; // ----------------------------------------------------------------------------- // Structs // ----------------------------------------------------------------------------- // Structs declarations -typedef struct AppCtx_private *AppCtx; -typedef struct CeedData_private *CeedData; -typedef struct User_private *User; -typedef struct Units_private *Units; -typedef struct SimpleBC_private *SimpleBC; -typedef struct Physics_private *Physics; +typedef struct AppCtx_private *AppCtx; +typedef struct CeedData_private *CeedData; +typedef struct User_private *User; +typedef struct Units_private *Units; +typedef struct SimpleBC_private *SimpleBC; +typedef struct Physics_private *Physics; +typedef struct ProblemData_private *ProblemData; // Application context from user command line options 
struct AppCtx_private { @@ -141,9 +121,6 @@ struct AppCtx_private { PetscViewerFormat viewer_format; PetscBool header_written; } wall_forces; - // Subgrid Stress Model - SGSModelType sgs_model_type; - PetscBool sgs_train_enable; // Differential Filtering PetscBool diff_filter_monitor; MeshTransformType mesh_transform_type; @@ -152,12 +129,9 @@ struct AppCtx_private { // libCEED data struct struct CeedData_private { CeedVector x_coord, q_data; - CeedBasis basis_x, basis_xc, basis_q, basis_x_sur, basis_q_sur; + CeedBasis basis_x, basis_q; CeedElemRestriction elem_restr_x, elem_restr_q, elem_restr_qd_i; - CeedOperator op_setup_vol; OperatorApplyContext op_ics_ctx; - CeedQFunction qf_setup_vol, qf_ics, qf_rhs_vol, qf_ifunction_vol, qf_setup_sur, qf_apply_inflow, qf_apply_inflow_jacobian, qf_apply_outflow, - qf_apply_outflow_jacobian, qf_apply_freestream, qf_apply_freestream_jacobian, qf_apply_slip, qf_apply_slip_jacobian; }; typedef struct { @@ -180,29 +154,6 @@ typedef struct { KSP ksp; } *NodalProjectionData; -typedef PetscErrorCode (*SgsDDNodalStressEval)(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc); -typedef PetscErrorCode (*SgsDDNodalStressInference)(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx); -typedef struct { - DM dm_sgs, dm_dd_inputs, dm_dd_outputs; - PetscInt num_comp_sgs, num_comp_inputs, num_comp_outputs; - OperatorApplyContext op_nodal_evaluation_ctx, op_nodal_dd_inputs_ctx, op_nodal_dd_outputs_ctx, op_sgs_apply_ctx; - CeedVector sgs_nodal_ceed, grad_velo_ceed; - SgsDDNodalStressEval sgs_nodal_eval; - SgsDDNodalStressInference sgs_nodal_inference; - void *sgs_nodal_inference_ctx; - PetscErrorCode (*sgs_nodal_inference_ctx_destroy)(void *ctx); -} *SgsDDData; - -typedef struct { - DM dm_dd_training; - PetscInt num_comp_dd_inputs, write_data_interval, num_filter_widths; - PetscScalar filter_widths[16]; - OperatorApplyContext op_training_data_calc_ctx; - NodalProjectionData filtered_grad_velo_proj; - size_t training_data_array_dims[2]; - 
PetscBool overwrite_training_data; -} *SGS_DD_TrainingData; - typedef struct { DM dm_filter; PetscInt num_filtered_fields; @@ -215,12 +166,6 @@ typedef struct { CeedContextFieldLabel filter_width_scaling_label; } *DiffFilterData; -typedef struct { - void *client; - char rank_id_name[16]; - PetscInt collocated_database_num_ranks; -} *SmartSimData; - // PETSc user data struct User_private { MPI_Comm comm; @@ -233,17 +178,14 @@ struct User_private { Physics phys; AppCtx app_ctx; CeedVector q_ceed, q_dot_ceed, g_ceed, x_ceed; - CeedOperator op_rhs_vol, op_ifunction_vol, op_ifunction; + CeedOperator op_ifunction; Mat mat_ijacobian; KSP mass_ksp; OperatorApplyContext op_rhs_ctx, op_strong_bc_ctx; CeedScalar time_bc_set; SpanStatsData spanstats; NodalProjectionData grad_velo_proj; - SgsDDData sgs_dd_data; DiffFilterData diff_filter; - SmartSimData smartsim; - SGS_DD_TrainingData sgs_dd_train; }; // Units @@ -263,12 +205,8 @@ struct Units_private { // Boundary conditions struct SimpleBC_private { - PetscInt num_wall, // Number of faces with wall BCs - wall_comps[5], // An array of constrained component numbers - num_comps, - num_symmetry[3], // Number of faces with symmetry BCs - num_inflow, num_outflow, num_freestream, num_slip; - PetscInt walls[16], symmetries[3][16], inflows[16], outflows[16], freestreams[16], slips[16]; + PetscInt num_inflow, num_outflow, num_freestream, num_slip; + PetscInt inflows[16], outflows[16], freestreams[16], slips[16]; }; // Struct that contains all enums and structs used for the physics of all problems @@ -279,9 +217,10 @@ struct Physics_private { CeedContextFieldLabel stg_solution_time_label; CeedContextFieldLabel timestep_size_label; CeedContextFieldLabel ics_time_label; - CeedContextFieldLabel ijacobian_time_shift_label; }; +PetscErrorCode BoundaryConditionSetUp(User user, ProblemData problem, AppCtx app_ctx, SimpleBC bc); + typedef struct { CeedQFunctionUser qfunction; const char *qfunction_loc; @@ -289,14 +228,15 @@ typedef struct { } 
ProblemQFunctionSpec; // Problem specific data -typedef struct ProblemData_private *ProblemData; struct ProblemData_private { CeedInt dim, q_data_size_vol, q_data_size_sur, jac_data_size_sur; CeedScalar dm_scale; - ProblemQFunctionSpec setup_vol, setup_sur, ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, - apply_freestream, apply_slip, apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian; - bool non_zero_time; - PetscBool bc_from_ics, use_strong_bc_ceed, uses_newtonian; + ProblemQFunctionSpec ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, apply_freestream, apply_slip, + apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian; + bool compute_exact_solution_error; + PetscBool set_bc_from_ics, use_strong_bc_ceed, uses_newtonian; + PetscCount num_bc_defs; + BCDefinition *bc_defs; PetscErrorCode (*print_info)(User, ProblemData, AppCtx); PetscErrorCode (*create_mass_operator)(User, CeedOperator *); }; @@ -329,7 +269,7 @@ extern PetscErrorCode PRINT_ADVECTION(User user, ProblemData problem, AppCtx app extern PetscErrorCode PRINT_ADVECTION2D(User user, ProblemData problem, AppCtx app_ctx); -PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MPI_Comm comm); +PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts); // ----------------------------------------------------------------------------- // libCEED functions @@ -349,13 +289,14 @@ PetscErrorCode DMPlexCeedElemRestrictionCollocatedCreate(Ceed ceed, DM dm, DMLab PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedInt label_value, CeedInt height, CeedInt dm_field, CeedBasis *basis); -// Utility function to create CEED Composite Operator for the entire domain -PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator 
op_apply_vol, - CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur, - CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian); - PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc); +PetscErrorCode QDataGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x, + CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size); +PetscErrorCode QDataGetNumComponents(DM dm, CeedInt *q_data_size); +PetscErrorCode QDataBoundaryGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x, + CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size); +PetscErrorCode QDataBoundaryGetNumComponents(DM dm, CeedInt *q_data_size); // ----------------------------------------------------------------------------- // Time-stepping functions // ----------------------------------------------------------------------------- @@ -369,7 +310,7 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void *ctx); // TS: Create, setup, and solve -PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts); +PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, ProblemData problem, Vec *Q, PetscScalar *f_time, TS *ts); // Update Boundary Values when time has changed PetscErrorCode UpdateBoundaryValues(User user, Vec Q_loc, PetscReal t); @@ -454,10 +395,6 @@ PetscErrorCode TurbulenceStatisticsDestroy(User user, CeedData ceed_data); // ----------------------------------------------------------------------------- // Data-Driven Subgrid Stress (DD-SGS) Modeling Functions // 
----------------------------------------------------------------------------- - -PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem); -PetscErrorCode SgsDDDataDestroy(SgsDDData sgs_dd_data); -PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc); PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, StateVariable state_var_input, CeedElemRestriction elem_restr_input, CeedBasis basis_input, NodalProjectionData *pgrad_velo_proj); PetscErrorCode VelocityGradientProjectionApply(NodalProjectionData grad_velo_proj, Vec Q_loc, Vec VelocityGradient); @@ -486,13 +423,3 @@ PetscErrorCode DifferentialFilterDataDestroy(DiffFilterData diff_filter); PetscErrorCode TSMonitor_DifferentialFilter(TS ts, PetscInt steps, PetscReal solution_time, Vec Q, void *ctx); PetscErrorCode DifferentialFilterApply(User user, const PetscReal solution_time, const Vec Q, Vec Filtered_Solution); PetscErrorCode DifferentialFilterMmsICSetup(ProblemData problem); - -// ----------------------------------------------------------------------------- -// SGS Data-Driven Training via SmartSim -// ----------------------------------------------------------------------------- -PetscErrorCode SmartSimSetup(User user); -PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim); -PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem); -PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx); -PetscErrorCode TSPostStep_SGS_DD_Training(TS ts); -PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train); diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c index 1d29b2cddb..79275a231c 100644 --- a/examples/fluids/problems/advection.c +++ b/examples/fluids/problems/advection.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, 
LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -14,8 +14,6 @@ #include #include "../navierstokes.h" -#include "../qfunctions/setupgeo.h" -#include "../qfunctions/setupgeo2d.h" // @brief Create CeedOperator for stabilized mass KSP for explicit timestepping // @@ -37,14 +35,12 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator CeedOperatorField field; PetscInt sub_op_index = 0; // will be 0 for the volume op - PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops)); + PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q)); - PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i)); - PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data)); PetscCallCeed(ceed, CeedOperatorGetContext(sub_ops[sub_op_index], &qf_ctx)); } @@ -76,6 +72,11 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE)); PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedVectorDestroy(&q_data)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q)); + 
PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_q)); + PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -106,12 +107,6 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc) switch (dim) { case 2: problem->dim = 2; - problem->q_data_size_vol = 5; - problem->q_data_size_sur = 3; - problem->setup_vol.qfunction = Setup2d; - problem->setup_vol.qfunction_loc = Setup2d_loc; - problem->setup_sur.qfunction = SetupBoundary2d; - problem->setup_sur.qfunction_loc = SetupBoundary2d_loc; problem->ics.qfunction = ICsAdvection2d; problem->ics.qfunction_loc = ICsAdvection2d_loc; problem->apply_vol_rhs.qfunction = RHS_Advection2d; @@ -120,17 +115,11 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc) problem->apply_vol_ifunction.qfunction_loc = IFunction_Advection2d_loc; problem->apply_inflow.qfunction = Advection2d_InOutFlow; problem->apply_inflow.qfunction_loc = Advection2d_InOutFlow_loc; - problem->non_zero_time = PETSC_TRUE; + problem->compute_exact_solution_error = PETSC_TRUE; problem->print_info = PRINT_ADVECTION; break; case 3: problem->dim = 3; - problem->q_data_size_vol = 10; - problem->q_data_size_sur = 10; - problem->setup_vol.qfunction = Setup; - problem->setup_vol.qfunction_loc = Setup_loc; - problem->setup_sur.qfunction = SetupBoundary; - problem->setup_sur.qfunction_loc = SetupBoundary_loc; problem->ics.qfunction = ICsAdvection; problem->ics.qfunction_loc = ICsAdvection_loc; problem->apply_vol_rhs.qfunction = RHS_Advection; @@ -139,7 +128,7 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc) problem->apply_vol_ifunction.qfunction_loc = IFunction_Advection_loc; problem->apply_inflow.qfunction = Advection_InOutFlow; problem->apply_inflow.qfunction_loc = Advection_InOutFlow_loc; - problem->non_zero_time = 
PETSC_FALSE; + problem->compute_exact_solution_error = PETSC_FALSE; problem->print_info = PRINT_ADVECTION; break; } @@ -210,8 +199,8 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc) } if (wind_type == WIND_TRANSLATION && advectionic_type == ADVECTIONIC_BUBBLE_CYLINDER && wind[2] != 0.) { wind[2] = 0; - PetscCall( - PetscPrintf(comm, "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -advection_ic_type cylinder\n")); + PetscCall(PetscPrintf(comm, + "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -advection_ic_type cylinder\n")); } if (stab == STAB_NONE && CtauS != 0) { PetscCall(PetscPrintf(comm, "Warning! Use -CtauS only with -stab su or -stab supg\n")); @@ -309,8 +298,8 @@ PetscErrorCode PRINT_ADVECTION(User user, ProblemData problem, AppCtx app_ctx) { PetscCall(PetscPrintf(comm, " Background Wind : %f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1])); break; case 3: - PetscCall( - PetscPrintf(comm, " Background Wind : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1], setup_ctx->wind[2])); + PetscCall(PetscPrintf(comm, " Background Wind : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1], + setup_ctx->wind[2])); break; } } diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c index cff3c74e67..b2f23f786e 100644 --- a/examples/fluids/problems/bc_freestream.c +++ b/examples/fluids/problems/bc_freestream.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,7 +16,9 @@ #include "../navierstokes.h" #include "../qfunctions/newtonian_types.h" -static const char *const RiemannSolverTypes[] = {"hll", "hllc", "RiemannSolverTypes", "RIEMANN_", NULL}; +static const char *const RiemannSolverTypes[] = {"HLL", "HLLC", "RiemannSolverTypes", "RIEMANN_", NULL}; + +static PetscErrorCode RiemannSolverUnitTests(NewtonianIdealGasContext gas, CeedScalar rtol); PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianIdealGasContext newtonian_ig_ctx, const StatePrimitive *reference) { User user = *(User *)ctx; @@ -78,6 +80,22 @@ PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, Newtonia break; } break; + case STATEVAR_ENTROPY: + switch (riemann) { + case RIEMANN_HLL: + problem->apply_freestream.qfunction = Freestream_Entropy_HLL; + problem->apply_freestream.qfunction_loc = Freestream_Entropy_HLL_loc; + problem->apply_freestream_jacobian.qfunction = Freestream_Jacobian_Entropy_HLL; + problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Entropy_HLL_loc; + break; + case RIEMANN_HLLC: + problem->apply_freestream.qfunction = Freestream_Entropy_HLLC; + problem->apply_freestream.qfunction_loc = Freestream_Entropy_HLLC_loc; + problem->apply_freestream_jacobian.qfunction = Freestream_Jacobian_Entropy_HLLC; + problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Entropy_HLLC_loc; + break; + } + break; } Y_inf.pressure *= Pascal; @@ -96,6 +114,13 @@ PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, Newtonia PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(freestream_context, CEED_MEM_HOST, FreeContextPetsc)); problem->apply_freestream.qfunction_context = freestream_context; PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(freestream_context, &problem->apply_freestream_jacobian.qfunction_context)); + + { + PetscBool run_unit_tests = PETSC_FALSE; + + PetscCall(PetscOptionsGetBool(NULL, 
NULL, "-riemann_solver_unit_tests", &run_unit_tests, NULL)); + if (run_unit_tests) PetscCall(RiemannSolverUnitTests(newtonian_ig_ctx, 5e-7)); + } PetscFunctionReturn(PETSC_SUCCESS); } @@ -119,13 +144,13 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId CeedScalar temperature = reference->temperature / Kelvin; CeedScalar recirc = 1, softplus_velocity = 1e-2; PetscOptionsBegin(user->comm, NULL, "Options for Outflow boundary condition", NULL); - PetscCall( - PetscOptionsEnum("-outflow_type", "Type of outflow condition", NULL, OutflowTypes, (PetscEnum)outflow_type, (PetscEnum *)&outflow_type, NULL)); + PetscCall(PetscOptionsEnum("-outflow_type", "Type of outflow condition", NULL, OutflowTypes, (PetscEnum)outflow_type, (PetscEnum *)&outflow_type, + NULL)); PetscCall(PetscOptionsScalar("-outflow_pressure", "Pressure at outflow condition", NULL, pressure, &pressure, NULL)); if (outflow_type == OUTFLOW_RIEMANN) { PetscCall(PetscOptionsScalar("-outflow_temperature", "Temperature at outflow condition", NULL, temperature, &temperature, NULL)); - PetscCall( - PetscOptionsReal("-outflow_recirc", "Fraction of recirculation to allow in exterior velocity state [0,1]", NULL, recirc, &recirc, NULL)); + PetscCall(PetscOptionsReal("-outflow_recirc", "Fraction of recirculation to allow in exterior velocity state [0,1]", NULL, recirc, &recirc, + NULL)); PetscCall(PetscOptionsReal("-outflow_softplus_velocity", "Characteristic velocity of softplus regularization", NULL, softplus_velocity, &softplus_velocity, NULL)); } @@ -148,6 +173,12 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId problem->apply_outflow_jacobian.qfunction = RiemannOutflow_Jacobian_Prim; problem->apply_outflow_jacobian.qfunction_loc = RiemannOutflow_Jacobian_Prim_loc; break; + case STATEVAR_ENTROPY: + problem->apply_outflow.qfunction = RiemannOutflow_Entropy; + problem->apply_outflow.qfunction_loc = RiemannOutflow_Entropy_loc; + 
problem->apply_outflow_jacobian.qfunction = RiemannOutflow_Jacobian_Entropy; + problem->apply_outflow_jacobian.qfunction_loc = RiemannOutflow_Jacobian_Entropy_loc; + break; } break; case OUTFLOW_PRESSURE: @@ -164,6 +195,12 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId problem->apply_outflow_jacobian.qfunction = PressureOutflow_Jacobian_Prim; problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Prim_loc; break; + case STATEVAR_ENTROPY: + problem->apply_outflow.qfunction = PressureOutflow_Entropy; + problem->apply_outflow.qfunction_loc = PressureOutflow_Entropy_loc; + problem->apply_outflow_jacobian.qfunction = PressureOutflow_Jacobian_Entropy; + problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Entropy_loc; + break; } break; } @@ -181,3 +218,336 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(outflow_context, &problem->apply_outflow_jacobian.qfunction_context)); PetscFunctionReturn(PETSC_SUCCESS); } + +// @brief Calculate relative error, (A - B) / S +// If S < threshold, then set S=1 +static inline CeedScalar RelativeError(CeedScalar S, CeedScalar A, CeedScalar B, CeedScalar threshold) { + return (A - B) / (fabs(S) > threshold ? 
S : 1); +} + +// @brief Check errors of a State vector and print if above tolerance +static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name, + PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) { + CeedScalar relative_error[5]; // relative error + CeedScalar divisor_threshold = 10 * CEED_EPSILON; + + PetscFunctionBeginUser; + relative_error[0] = RelativeError(Q_s[0], Q_a[0], Q_b[0], divisor_threshold); + relative_error[4] = RelativeError(Q_s[4], Q_a[4], Q_b[4], divisor_threshold); + + CeedScalar u_magnitude = sqrt(Square(Q_s[1]) + Square(Q_s[2]) + Square(Q_s[3])); + for (int i = 1; i < 4; i++) { + relative_error[i] = RelativeError(u_magnitude, Q_a[i], Q_b[i], divisor_threshold); + } + + if (fabs(relative_error[0]) >= rtol_0) { + printf("%s[0] error %g (expected %.10e, got %.10e)\n", name, relative_error[0], Q_s[0], Q_a[0]); + } + for (int i = 1; i < 4; i++) { + if (fabs(relative_error[i]) >= rtol_u) { + printf("%s[%d] error %g (expected %.10e, got %.10e)\n", name, i, relative_error[i], Q_s[i], Q_a[i]); + } + } + if (fabs(relative_error[4]) >= rtol_4) { + printf("%s[4] error %g (expected %.10e, got %.10e)\n", name, relative_error[4], Q_s[4], Q_a[4]); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Verify RiemannFlux_HLL_fwd function against finite-difference approximation +static PetscErrorCode TestRiemannHLL_fwd(NewtonianIdealGasContext gas, CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) { + CeedScalar eps = 4e-7; // Finite difference step + char buf[128]; + const CeedScalar T = 200; + const CeedScalar rho = 1.2; + const CeedScalar p = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T; + const CeedScalar u_base = 40; + const CeedScalar u[3] = {u_base, u_base * 1.1, u_base * 1.2}; + const CeedScalar Y0_left[5] = {p, u[0], u[1], u[2], T}; + const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T}; + CeedScalar normal[3] = {1, 2, 3}; + + 
PetscFunctionBeginUser; + State left0 = StateFromY(gas, Y0_left); + State right0 = StateFromY(gas, Y0_right); + ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3); + + for (int i = 0; i < 10; i++) { + CeedScalar dFlux[5] = {0.}, dFlux_fd[5] = {0.}; + { // Calculate dFlux using *_fwd function + CeedScalar dY_right[5] = {0}; + CeedScalar dY_left[5] = {0}; + + if (i < 5) { + dY_left[i] = Y0_left[i]; + } else { + dY_right[i % 5] = Y0_right[i % 5]; + } + State dleft0 = StateFromY_fwd(gas, left0, dY_left); + State dright0 = StateFromY_fwd(gas, right0, dY_right); + + StateConservative dFlux_state = RiemannFlux_HLL_fwd(gas, left0, dleft0, right0, dright0, normal); + UnpackState_U(dFlux_state, dFlux); + } + + { // Calculate dFlux_fd via finite difference approximation + CeedScalar Y1_left[5] = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]}; + CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]}; + CeedScalar Flux0[5], Flux1[5]; + + if (i < 5) { + Y1_left[i] *= 1 + eps; + } else { + Y1_right[i % 5] *= 1 + eps; + } + State left1 = StateFromY(gas, Y1_left); + State right1 = StateFromY(gas, Y1_right); + + StateConservative Flux0_state = RiemannFlux_HLL(gas, left0, right0, normal); + StateConservative Flux1_state = RiemannFlux_HLL(gas, left1, right1, normal); + UnpackState_U(Flux0_state, Flux0); + UnpackState_U(Flux1_state, Flux1); + for (int j = 0; j < 5; j++) dFlux_fd[j] = (Flux1[j] - Flux0[j]) / eps; + } + + snprintf(buf, sizeof buf, "RiemannFlux_HLL i=%d: Flux", i); + PetscCall(CheckQWithTolerance(dFlux_fd, dFlux, dFlux_fd, buf, rtol_0, rtol_u, rtol_4)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Verify RiemannFlux_HLLC_fwd function against finite-difference approximation +static PetscErrorCode TestRiemannHLLC_fwd(NewtonianIdealGasContext gas, CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) { + CeedScalar eps = 4e-7; // Finite difference step + char buf[128]; + const CeedScalar T = 200; + const 
CeedScalar rho = 1.2; + const CeedScalar p = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T; + const CeedScalar u_base = 40; + const CeedScalar u[3] = {u_base, u_base * 1.1, u_base * 1.2}; + const CeedScalar Y0_left[5] = {p, u[0], u[1], u[2], T}; + const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T}; + CeedScalar normal[3] = {1, 2, 3}; + + PetscFunctionBeginUser; + State left0 = StateFromY(gas, Y0_left); + State right0 = StateFromY(gas, Y0_right); + ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3); + + for (int i = 0; i < 10; i++) { + CeedScalar dFlux[5] = {0.}, dFlux_fd[5] = {0.}; + { // Calculate dFlux using *_fwd function + CeedScalar dY_right[5] = {0}; + CeedScalar dY_left[5] = {0}; + + if (i < 5) { + dY_left[i] = Y0_left[i]; + } else { + dY_right[i % 5] = Y0_right[i % 5]; + } + State dleft0 = StateFromY_fwd(gas, left0, dY_left); + State dright0 = StateFromY_fwd(gas, right0, dY_right); + + StateConservative dFlux_state = RiemannFlux_HLLC_fwd(gas, left0, dleft0, right0, dright0, normal); + UnpackState_U(dFlux_state, dFlux); + } + + { // Calculate dFlux_fd via finite difference approximation + CeedScalar Y1_left[5] = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]}; + CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]}; + CeedScalar Flux0[5], Flux1[5]; + + if (i < 5) { + Y1_left[i] *= 1 + eps; + } else { + Y1_right[i % 5] *= 1 + eps; + } + State left1 = StateFromY(gas, Y1_left); + State right1 = StateFromY(gas, Y1_right); + + StateConservative Flux0_state = RiemannFlux_HLLC(gas, left0, right0, normal); + StateConservative Flux1_state = RiemannFlux_HLLC(gas, left1, right1, normal); + UnpackState_U(Flux0_state, Flux0); + UnpackState_U(Flux1_state, Flux1); + for (int j = 0; j < 5; j++) dFlux_fd[j] = (Flux1[j] - Flux0[j]) / eps; + } + + snprintf(buf, sizeof buf, "RiemannFlux_HLLC i=%d: Flux", i); + PetscCall(CheckQWithTolerance(dFlux_fd, dFlux, dFlux_fd, buf, rtol_0, rtol_u, 
rtol_4)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Verify ComputeHLLSpeeds_Roe_fwd function against finite-difference approximation +static PetscErrorCode TestComputeHLLSpeeds_Roe_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) { + CeedScalar eps = 4e-7; // Finite difference step + char buf[128]; + const CeedScalar T = 200; + const CeedScalar rho = 1.2; + const CeedScalar p = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T; + const CeedScalar u_base = 40; + const CeedScalar u[3] = {u_base, u_base * 1.1, u_base * 1.2}; + const CeedScalar Y0_left[5] = {p, u[0], u[1], u[2], T}; + const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T}; + CeedScalar normal[3] = {1, 2, 3}; + + PetscFunctionBeginUser; + State left0 = StateFromY(gas, Y0_left); + State right0 = StateFromY(gas, Y0_right); + ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3); + CeedScalar u_left0 = Dot3(left0.Y.velocity, normal); + CeedScalar u_right0 = Dot3(right0.Y.velocity, normal); + + for (int i = 0; i < 10; i++) { + CeedScalar ds_left, ds_right, ds_left_fd, ds_right_fd; + { // Calculate ds_{left,right} using *_fwd function + CeedScalar dY_right[5] = {0}; + CeedScalar dY_left[5] = {0}; + + if (i < 5) { + dY_left[i] = Y0_left[i]; + } else { + dY_right[i % 5] = Y0_right[i % 5]; + } + State dleft0 = StateFromY_fwd(gas, left0, dY_left); + State dright0 = StateFromY_fwd(gas, right0, dY_right); + CeedScalar du_left = Dot3(dleft0.Y.velocity, normal); + CeedScalar du_right = Dot3(dright0.Y.velocity, normal); + + CeedScalar s_left, s_right; // Throw away + ComputeHLLSpeeds_Roe_fwd(gas, left0, dleft0, u_left0, du_left, right0, dright0, u_right0, du_right, &s_left, &ds_left, &s_right, &ds_right); + } + + { // Calculate ds_{left,right}_fd via finite difference approximation + CeedScalar Y1_left[5] = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]}; + CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]}; + + if (i < 
5) { + Y1_left[i] *= 1 + eps; + } else { + Y1_right[i % 5] *= 1 + eps; + } + State left1 = StateFromY(gas, Y1_left); + State right1 = StateFromY(gas, Y1_right); + CeedScalar u_left1 = Dot3(left1.Y.velocity, normal); + CeedScalar u_right1 = Dot3(right1.Y.velocity, normal); + + CeedScalar s_left0, s_right0, s_left1, s_right1; + ComputeHLLSpeeds_Roe(gas, left0, u_left0, right0, u_right0, &s_left0, &s_right0); + ComputeHLLSpeeds_Roe(gas, left1, u_left1, right1, u_right1, &s_left1, &s_right1); + ds_left_fd = (s_left1 - s_left0) / eps; + ds_right_fd = (s_right1 - s_right0) / eps; + } + + snprintf(buf, sizeof buf, "ComputeHLLSpeeds_Roe i=%d:", i); + { + CeedScalar divisor_threshold = 10 * CEED_EPSILON; + CeedScalar ds_left_err, ds_right_err; + + ds_left_err = RelativeError(ds_left_fd, ds_left, ds_left_fd, divisor_threshold); + ds_right_err = RelativeError(ds_right_fd, ds_right, ds_right_fd, divisor_threshold); + if (fabs(ds_left_err) >= rtol) printf("%s ds_left error %g (expected %.10e, got %.10e)\n", buf, ds_left_err, ds_left_fd, ds_left); + if (fabs(ds_right_err) >= rtol) printf("%s ds_right error %g (expected %.10e, got %.10e)\n", buf, ds_right_err, ds_right_fd, ds_right); + } + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Verify TotalSpecificEnthalpy_fwd function against finite-difference approximation +static PetscErrorCode TestTotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) { + CeedScalar eps = 4e-7; // Finite difference step + char buf[128]; + const CeedScalar T = 200; + const CeedScalar rho = 1.2; + const CeedScalar p = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T; + const CeedScalar u_base = 40; + const CeedScalar u[3] = {u_base, u_base * 1.1, u_base * 1.2}; + const CeedScalar Y0[5] = {p, u[0], u[1], u[2], T}; + + PetscFunctionBeginUser; + State state0 = StateFromY(gas, Y0); + + for (int i = 0; i < 5; i++) { + CeedScalar dH, dH_fd; + { // Calculate dH using *_fwd function + CeedScalar dY[5] = {0}; + + dY[i] = Y0[i]; + State 
dstate0 = StateFromY_fwd(gas, state0, dY); + dH = TotalSpecificEnthalpy_fwd(gas, state0, dstate0); + } + + { // Calculate dH_fd via finite difference approximation + CeedScalar H0, H1; + CeedScalar Y1[5] = {Y0[0], Y0[1], Y0[2], Y0[3], Y0[4]}; + Y1[i] *= 1 + eps; + State state1 = StateFromY(gas, Y1); + + H0 = TotalSpecificEnthalpy(gas, state0); + H1 = TotalSpecificEnthalpy(gas, state1); + dH_fd = (H1 - H0) / eps; + } + + snprintf(buf, sizeof buf, "TotalSpecificEnthalpy i=%d:", i); + { + CeedScalar divisor_threshold = 10 * CEED_EPSILON; + CeedScalar dH_err; + + dH_err = RelativeError(dH_fd, dH, dH_fd, divisor_threshold); + if (fabs(dH_err) >= rtol) printf("%s dH error %g (expected %.10e, got %.10e)\n", buf, dH_err, dH_fd, dH); + } + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Verify RoeSetup_fwd function against finite-difference approximation +static PetscErrorCode TestRowSetup_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) { + CeedScalar eps = 4e-7; // Finite difference step + char buf[128]; + const CeedScalar rho0[2] = {1.2, 1.4}; + + PetscFunctionBeginUser; + for (int i = 0; i < 2; i++) { + RoeWeights dR, dR_fd; + { // Calculate using *_fwd function + CeedScalar drho[5] = {0}; + + drho[i] = rho0[i]; + dR = RoeSetup_fwd(rho0[0], rho0[1], drho[0], drho[1]); + } + + { // Calculate via finite difference approximation + RoeWeights R0, R1; + CeedScalar rho1[5] = {rho0[0], rho0[1]}; + rho1[i] *= 1 + eps; + + R0 = RoeSetup(rho0[0], rho0[1]); + R1 = RoeSetup(rho1[0], rho1[1]); + dR_fd.left = (R1.left - R0.left) / eps; + dR_fd.right = (R1.right - R0.right) / eps; + } + + snprintf(buf, sizeof buf, "RoeSetup i=%d:", i); + { + CeedScalar divisor_threshold = 10 * CEED_EPSILON; + RoeWeights dR_err; + + dR_err.left = RelativeError(dR_fd.left, dR.left, dR_fd.left, divisor_threshold); + dR_err.right = RelativeError(dR_fd.right, dR.right, dR_fd.right, divisor_threshold); + if (fabs(dR_err.left) >= rtol) printf("%s dR.left error %g (expected %.10e, got %.10e)\n", 
buf, dR_err.left, dR_fd.left, dR.left); + if (fabs(dR_err.right) >= rtol) printf("%s dR.right error %g (expected %.10e, got %.10e)\n", buf, dR_err.right, dR_fd.right, dR.right); + } + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Test Riemann solver related `*_fwd` functions via finite-difference approximation +static PetscErrorCode RiemannSolverUnitTests(NewtonianIdealGasContext gas, CeedScalar rtol) { + PetscFunctionBeginUser; + PetscCall(TestRiemannHLL_fwd(gas, rtol, rtol, rtol)); + PetscCall(TestRiemannHLLC_fwd(gas, rtol, rtol, rtol)); + PetscCall(TestComputeHLLSpeeds_Roe_fwd(gas, rtol)); + PetscCall(TestTotalSpecificEnthalpy_fwd(gas, rtol)); + PetscCall(TestRowSetup_fwd(gas, rtol)); + PetscFunctionReturn(PETSC_SUCCESS); +} diff --git a/examples/fluids/problems/bc_slip.c b/examples/fluids/problems/bc_slip.c index 4b6708436e..727188dfe0 100644 --- a/examples/fluids/problems/bc_slip.c +++ b/examples/fluids/problems/bc_slip.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -34,6 +34,12 @@ PetscErrorCode SlipBCSetup(ProblemData problem, DM dm, void *ctx, CeedQFunctionC problem->apply_slip_jacobian.qfunction = Slip_Jacobian_Prim; problem->apply_slip_jacobian.qfunction_loc = Slip_Jacobian_Prim_loc; break; + case STATEVAR_ENTROPY: + problem->apply_slip.qfunction = Slip_Entropy; + problem->apply_slip.qfunction_loc = Slip_Entropy_loc; + problem->apply_slip_jacobian.qfunction = Slip_Jacobian_Entropy; + problem->apply_slip_jacobian.qfunction_loc = Slip_Jacobian_Entropy_loc; + break; } PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(newtonian_ig_qfctx, &problem->apply_slip.qfunction_context)); diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c index 4fbfc977a5..b9f3654046 100644 --- a/examples/fluids/problems/blasius.c +++ b/examples/fluids/problems/blasius.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -21,10 +21,12 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) { const BlasiusContext blasius = (BlasiusContext)ctx; const PetscScalar *Tf, *Th; // Chebyshev coefficients PetscScalar *r, f[4], h[4]; - PetscInt N = blasius->n_cheb; + PetscInt N = blasius->n_cheb; + State S_infty = blasius->S_infty; + CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity)); PetscFunctionBeginUser; - PetscScalar Ma = Mach(&blasius->newtonian_ctx, blasius->T_inf, blasius->U_inf), Pr = Prandtl(&blasius->newtonian_ctx), + PetscScalar Ma = Mach(&blasius->newtonian_ctx, S_infty.Y.temperature, U_infty), Pr = Prandtl(&blasius->newtonian_ctx), gamma = HeatCapacityRatio(&blasius->newtonian_ctx); PetscCall(VecGetArrayRead(X, &Tf)); @@ -59,7 +61,7 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) { // h - left end boundary condition ChebyshevEval(N - 1, Th, -1., blasius->eta_max, h); - r[N] = h[0] - blasius->T_wall / blasius->T_inf; + r[N] = h[0] - blasius->T_wall / S_infty.Y.temperature; // h - right end boundary condition ChebyshevEval(N - 1, Th, 1., blasius->eta_max, h); @@ -117,26 +119,33 @@ static PetscErrorCode GetYNodeLocs(const MPI_Comm comm, const char path[PETSC_MA FILE *fp; const PetscInt char_array_len = 512; char line[char_array_len]; - char **array; PetscReal *node_locs; PetscFunctionBeginUser; PetscCall(PetscFOpen(comm, path, "r", &fp)); PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line)); - PetscCall(PetscStrToArray(line, ' ', &ndims, &array)); - for (PetscInt i = 0; i < ndims; i++) dims[i] = atoi(array[i]); + { + char **array; + + PetscCall(PetscStrToArray(line, ' ', &ndims, &array)); + for (PetscInt i = 0; i < ndims; i++) dims[i] = atoi(array[i]); + PetscCall(PetscStrToArrayDestroy(ndims, array)); + } if (ndims < 2) dims[1] = 1; // Assume 1 column of data is not otherwise specified *nynodes = dims[0]; PetscCall(PetscMalloc1(*nynodes, 
&node_locs)); for (PetscInt i = 0; i < dims[0]; i++) { + char **array; + PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line)); PetscCall(PetscStrToArray(line, ' ', &ndims, &array)); PetscCheck(ndims == dims[1], comm, PETSC_ERR_FILE_UNEXPECTED, "Line %" PetscInt_FMT " of %s does not contain correct number of columns (%d instead of %d)", i, path, ndims, dims[1]); node_locs[i] = (PetscReal)atof(array[0]); + PetscCall(PetscStrToArrayDestroy(ndims, array)); } PetscCall(PetscFClose(comm, fp)); *pynodes = node_locs; @@ -252,7 +261,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { CeedScalar T_inf = 288.; // K CeedScalar T_wall = 288.; // K CeedScalar delta0 = 4.2e-3; // m - CeedScalar P0 = 1.01e5; // Pa + CeedScalar P_inf = 1.01e5; // Pa PetscInt N = 20; // Number of Chebyshev terms PetscBool weakT = PETSC_FALSE; // weak density or temperature PetscReal mesh_refine_height = 5.9e-4; // m @@ -260,14 +269,19 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { PetscInt mesh_Ndelta = 45; // [-] PetscReal mesh_top_angle = 5; // degrees char mesh_ynodes_path[PETSC_MAX_PATH_LEN] = ""; + PetscBool P0_set; PetscOptionsBegin(comm, NULL, "Options for BLASIUS problem", NULL); PetscCall(PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", NULL, weakT, &weakT, NULL)); PetscCall(PetscOptionsScalar("-velocity_infinity", "Velocity at boundary layer edge", NULL, U_inf, &U_inf, NULL)); PetscCall(PetscOptionsScalar("-temperature_infinity", "Temperature at boundary layer edge", NULL, T_inf, &T_inf, NULL)); + PetscCall(PetscOptionsHasName(NULL, NULL, "-P0", &P0_set)); // For maintaining behavior of -P0 flag (which is deprecated) + PetscCall( + PetscOptionsDeprecated("-P0", "-pressure_infinity", "libCEED 0.12.0", + "Use -pressure_infinity to set pressure at boundary layer edge and -idl_pressure to set the IDL reference pressure")); + PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at 
boundary layer edge", NULL, P_inf, &P_inf, NULL)); PetscCall(PetscOptionsScalar("-temperature_wall", "Temperature at wall", NULL, T_wall, &T_wall, NULL)); PetscCall(PetscOptionsScalar("-delta0", "Boundary layer height at inflow", NULL, delta0, &delta0, NULL)); - PetscCall(PetscOptionsScalar("-P0", "Pressure at outflow", NULL, P0, &P0, NULL)); PetscCall(PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", NULL, N, &N, NULL)); PetscCheck(3 <= N && N <= BLASIUS_MAX_N_CHEBYSHEV, comm, PETSC_ERR_ARG_OUTOFRANGE, "-n_chebyshev %" PetscInt_FMT " must be in range [3, %d]", N, BLASIUS_MAX_N_CHEBYSHEV); @@ -276,8 +290,8 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { PetscCall(PetscOptionsScalar("-platemesh_refine_height", "Height of boundary layer mesh refinement", NULL, mesh_refine_height, &mesh_refine_height, NULL)); PetscCall(PetscOptionsScalar("-platemesh_growth", "Geometric growth rate of boundary layer mesh", NULL, mesh_growth, &mesh_growth, NULL)); - PetscCall( - PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle, NULL)); + PetscCall(PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle, + NULL)); PetscCall(PetscOptionsString("-platemesh_y_node_locs_path", "Path to file with y node locations. 
" "If empty, will use the algorithmic mesh warping.", @@ -293,7 +307,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { T_inf *= Kelvin; T_wall *= Kelvin; - P0 *= Pascal; + P_inf *= Pascal; U_inf *= meter / second; delta0 *= meter; @@ -308,15 +322,19 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { // Some properties depend on parameters from NewtonianIdealGas PetscCallCeed(ceed, CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx)); - blasius_ctx->weakT = weakT; - blasius_ctx->U_inf = U_inf; - blasius_ctx->T_inf = T_inf; - blasius_ctx->T_wall = T_wall; - blasius_ctx->delta0 = delta0; - blasius_ctx->P0 = P0; - blasius_ctx->n_cheb = N; - newtonian_ig_ctx->P0 = P0; - blasius_ctx->implicit = user->phys->implicit; + StatePrimitive Y_inf = { + .pressure = P_inf, .velocity = {U_inf, 0, 0}, + .temperature = T_inf + }; + State S_infty = StateFromPrimitive(newtonian_ig_ctx, Y_inf); + + blasius_ctx->weakT = weakT; + blasius_ctx->T_wall = T_wall; + blasius_ctx->delta0 = delta0; + blasius_ctx->S_infty = S_infty; + blasius_ctx->n_cheb = N; + blasius_ctx->implicit = user->phys->implicit; + if (P0_set) newtonian_ig_ctx->idl_pressure = P_inf; // For maintaining behavior of -P0 flag (which is deprecated) blasius_ctx->newtonian_ctx = *newtonian_ig_ctx; { @@ -338,10 +356,12 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->ics.qfunction_context)); problem->ics.qfunction_context = blasius_context; if (use_stg) { - PetscCall(SetupStg(comm, dm, problem, user, weakT, T_inf, P0)); + PetscCall(SetupStg(comm, dm, problem, user, weakT, S_infty.Y.temperature, S_infty.Y.pressure)); } else if (diff_filter_mms) { PetscCall(DifferentialFilterMmsICSetup(problem)); } else { + PetscCheck((user->phys->state_var == STATEVAR_CONSERVATIVE) || (user->app_ctx->test_type == TESTTYPE_DIFF_FILTER), 
user->comm, + PETSC_ERR_ARG_INCOMP, "Can only use conservative variables with Blasius and weak inflow"); problem->apply_inflow.qfunction = Blasius_Inflow; problem->apply_inflow.qfunction_loc = Blasius_Inflow_loc; problem->apply_inflow_jacobian.qfunction = Blasius_Inflow_Jacobian; diff --git a/examples/fluids/problems/channel.c b/examples/fluids/problems/channel.c index 8c0511114b..55734e042d 100644 --- a/examples/fluids/problems/channel.c +++ b/examples/fluids/problems/channel.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/problems/densitycurrent.c b/examples/fluids/problems/densitycurrent.c index e49da42b5b..1dbbe36fb0 100644 --- a/examples/fluids/problems/densitycurrent.c +++ b/examples/fluids/problems/densitycurrent.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c index 0115ab5c83..34d74052ab 100644 --- a/examples/fluids/problems/eulervortex.c +++ b/examples/fluids/problems/eulervortex.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -14,7 +14,6 @@ #include #include "../navierstokes.h" -#include "../qfunctions/setupgeo.h" PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { EulerTestType euler_test; @@ -33,12 +32,6 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b // SET UP DENSITY_CURRENT // ------------------------------------------------------ problem->dim = 3; - problem->q_data_size_vol = 10; - problem->q_data_size_sur = 10; - problem->setup_vol.qfunction = Setup; - problem->setup_vol.qfunction_loc = Setup_loc; - problem->setup_sur.qfunction = SetupBoundary; - problem->setup_sur.qfunction_loc = SetupBoundary_loc; problem->ics.qfunction = ICsEuler; problem->ics.qfunction_loc = ICsEuler_loc; problem->apply_vol_rhs.qfunction = Euler; @@ -49,7 +42,7 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b problem->apply_inflow.qfunction_loc = TravelingVortex_Inflow_loc; problem->apply_outflow.qfunction = Euler_Outflow; problem->apply_outflow.qfunction_loc = Euler_Outflow_loc; - problem->non_zero_time = PETSC_TRUE; + problem->compute_exact_solution_error = PETSC_TRUE; problem->print_info = PRINT_EULER_VORTEX; // ------------------------------------------------------ @@ -145,6 +138,7 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_vol_ifunction.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_inflow.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_outflow.qfunction_context)); + PetscCallCeed(ceed, CeedQFunctionContextDestroy(&euler_context)); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/problems/gaussianwave.c b/examples/fluids/problems/gaussianwave.c index 9af7924b78..abadc453f7 100644 --- 
a/examples/fluids/problems/gaussianwave.c +++ b/examples/fluids/problems/gaussianwave.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -37,6 +37,10 @@ PetscErrorCode NS_GAUSSIAN_WAVE(ProblemData problem, DM dm, void *ctx, SimpleBC problem->ics.qfunction = IC_GaussianWave_Prim; problem->ics.qfunction_loc = IC_GaussianWave_Prim_loc; break; + case STATEVAR_ENTROPY: + problem->ics.qfunction = IC_GaussianWave_Entropy; + problem->ics.qfunction_loc = IC_GaussianWave_Entropy_loc; + break; } // -- Option Defaults diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c index 61c7ec5a2d..1ab6e222ac 100644 --- a/examples/fluids/problems/newtonian.c +++ b/examples/fluids/problems/newtonian.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -14,50 +14,140 @@ #include #include "../navierstokes.h" -#include "../qfunctions/setupgeo.h" // For use with PetscOptionsEnum -static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "StateVariable", "STATEVAR_", NULL}; +static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "ENTROPY", "StateVariable", "STATEVAR_", NULL}; -// Compute relative error |a - b|/|s| -static PetscErrorCode CheckPrimitiveWithTolerance(StatePrimitive sY, StatePrimitive aY, StatePrimitive bY, const char *name, PetscReal rtol_pressure, - PetscReal rtol_velocity, PetscReal rtol_temperature) { - StatePrimitive eY; // relative error +static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name, + PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) { + CeedScalar relative_error[5]; // relative error + CeedScalar divisor_threshold = 10 * CEED_EPSILON; PetscFunctionBeginUser; - eY.pressure = (aY.pressure - bY.pressure) / sY.pressure; - PetscScalar u = sqrt(Square(sY.velocity[0]) + Square(sY.velocity[1]) + Square(sY.velocity[2])); - for (int j = 0; j < 3; j++) eY.velocity[j] = (aY.velocity[j] - bY.velocity[j]) / u; - eY.temperature = (aY.temperature - bY.temperature) / sY.temperature; - if (fabs(eY.pressure) > rtol_pressure) printf("%s: pressure error %g\n", name, eY.pressure); - for (int j = 0; j < 3; j++) { - if (fabs(eY.velocity[j]) > rtol_velocity) printf("%s: velocity[%d] error %g\n", name, j, eY.velocity[j]); + relative_error[0] = (Q_a[0] - Q_b[0]) / (fabs(Q_s[0]) > divisor_threshold ? Q_s[0] : 1); + relative_error[4] = (Q_a[4] - Q_b[4]) / (fabs(Q_s[4]) > divisor_threshold ? Q_s[4] : 1); + + CeedScalar u_magnitude = sqrt(Square(Q_s[1]) + Square(Q_s[2]) + Square(Q_s[3])); + CeedScalar u_divisor = u_magnitude > divisor_threshold ? 
u_magnitude : 1; + for (int i = 1; i < 4; i++) { + relative_error[i] = (Q_a[i] - Q_b[i]) / u_divisor; + } + + if (fabs(relative_error[0]) >= rtol_0) { + printf("%s[0] error %g (expected %.10e, got %.10e)\n", name, relative_error[0], Q_s[0], Q_a[0]); + } + for (int i = 1; i < 4; i++) { + if (fabs(relative_error[i]) >= rtol_u) { + printf("%s[%d] error %g (expected %.10e, got %.10e)\n", name, i, relative_error[i], Q_s[i], Q_a[i]); + } + } + if (fabs(relative_error[4]) >= rtol_4) { + printf("%s[4] error %g (expected %.10e, got %.10e)\n", name, relative_error[4], Q_s[4], Q_a[4]); } - if (fabs(eY.temperature) > rtol_temperature) printf("%s: temperature error %g\n", name, eY.temperature); PetscFunctionReturn(PETSC_SUCCESS); } +// @brief Verify `StateFromQ` by converting A0 -> B0 -> A0_test, where A0 should equal A0_test +static PetscErrorCode TestState(StateVariable state_var_A, StateVariable state_var_B, NewtonianIdealGasContext gas, const CeedScalar A0[5], + CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) { + CeedScalar B0[5], A0_test[5]; + char buf[128]; + const char *const StateVariables_Initial[] = {"U", "Y", "V"}; + + PetscFunctionBeginUser; + const char *A_initial = StateVariables_Initial[state_var_A]; + const char *B_initial = StateVariables_Initial[state_var_B]; + + State state_A0 = StateFromQ(gas, A0, state_var_A); + StateToQ(gas, state_A0, B0, state_var_B); + State state_B0 = StateFromQ(gas, B0, state_var_B); + StateToQ(gas, state_B0, A0_test, state_var_A); + + snprintf(buf, sizeof buf, "%s->%s->%s: %s", A_initial, B_initial, A_initial, A_initial); + PetscCall(CheckQWithTolerance(A0, A0_test, A0, buf, rtol_0, rtol_u, rtol_4)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Verify `StateFromQ_fwd` via a finite difference approximation +static PetscErrorCode TestState_fwd(StateVariable state_var_A, StateVariable state_var_B, NewtonianIdealGasContext gas, const CeedScalar A0[5], + CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) { + 
CeedScalar eps = 4e-7; // Finite difference step + char buf[128]; + const char *const StateVariables_Initial[] = {"U", "Y", "V"}; + + PetscFunctionBeginUser; + const char *A_initial = StateVariables_Initial[state_var_A]; + const char *B_initial = StateVariables_Initial[state_var_B]; + State state_0 = StateFromQ(gas, A0, state_var_A); + + for (int i = 0; i < 5; i++) { + CeedScalar dB[5] = {0.}, dB_fd[5] = {0.}; + { // Calculate dB using State functions + CeedScalar dA[5] = {0}; + + dA[i] = A0[i]; + State dstate_0 = StateFromQ_fwd(gas, state_0, dA, state_var_A); + StateToQ_fwd(gas, state_0, dstate_0, dB, state_var_B); + } + + { // Calculate dB_fd via finite difference approximation + CeedScalar A1[5], B0[5], B1[5]; + + for (int j = 0; j < 5; j++) A1[j] = (1 + eps * (i == j)) * A0[j]; + State state_1 = StateFromQ(gas, A1, state_var_A); + StateToQ(gas, state_0, B0, state_var_B); + StateToQ(gas, state_1, B1, state_var_B); + for (int j = 0; j < 5; j++) dB_fd[j] = (B1[j] - B0[j]) / eps; + } + + snprintf(buf, sizeof buf, "d%s->d%s: StateFrom%s_fwd i=%d: d%s", A_initial, B_initial, A_initial, i, B_initial); + PetscCall(CheckQWithTolerance(dB_fd, dB, dB_fd, buf, rtol_0, rtol_u, rtol_4)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +// @brief Test the Newtonian State transformation functions, `StateFrom*` static PetscErrorCode UnitTests_Newtonian(User user, NewtonianIdealGasContext gas) { Units units = user->units; - const CeedScalar eps = 1e-6; - const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, Pascal = units->Pascal; + const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, K = units->Kelvin; + PetscFunctionBeginUser; - const CeedScalar rho = 1.2 * kg / (m * m * m), u = 40 * m / sec; - CeedScalar U[5] = {rho, rho * u, rho * u * 1.1, rho * u * 1.2, 250e3 * Pascal + .5 * rho * u * u}; - State s = StateFromU(gas, U); - for (int i = 0; i < 8; i++) { - CeedScalar dU[5] = {0}; - if (i < 5) dU[i] = U[i]; - State ds = 
StateFromU_fwd(gas, s, dU); - for (int j = 0; j < 5; j++) dU[j] = (1 + eps * (i == j)) * U[j]; - State t = StateFromU(gas, dU); - StatePrimitive dY; - dY.pressure = (t.Y.pressure - s.Y.pressure) / eps; - for (int j = 0; j < 3; j++) dY.velocity[j] = (t.Y.velocity[j] - s.Y.velocity[j]) / eps; - dY.temperature = (t.Y.temperature - s.Y.temperature) / eps; - char buf[128]; - snprintf(buf, sizeof buf, "StateFromU_fwd i=%d", i); - PetscCall(CheckPrimitiveWithTolerance(dY, ds.Y, dY, buf, 5e-6, 1e-6, 1e-6)); + const CeedScalar T = 200 * K; + const CeedScalar rho = 1.2 * kg / Cube(m); + const CeedScalar P = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T; + const CeedScalar u_base = 40 * m / sec; + const CeedScalar u[3] = {u_base, u_base * 1.1, u_base * 1.2}; + const CeedScalar e_kinetic = 0.5 * Dot3(u, u); + const CeedScalar e_internal = gas->cv * T; + const CeedScalar e_total = e_kinetic + e_internal; + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar entropy = log(P) - gamma * log(rho); + const CeedScalar rho_div_p = rho / P; + const CeedScalar Y0[5] = {P, u[0], u[1], u[2], T}; + const CeedScalar U0[5] = {rho, rho * u[0], rho * u[1], rho * u[2], rho * e_total}; + const CeedScalar V0[5] = {(gamma - entropy) / (gamma - 1) - rho_div_p * (e_kinetic), rho_div_p * u[0], rho_div_p * u[1], rho_div_p * u[2], + -rho_div_p}; + + { + CeedScalar rtol = 40 * CEED_EPSILON; + + PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol)); + PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol)); + PetscCall(TestState(STATEVAR_CONSERVATIVE, STATEVAR_PRIMITIVE, gas, U0, rtol, rtol, rtol)); + PetscCall(TestState(STATEVAR_CONSERVATIVE, STATEVAR_ENTROPY, gas, U0, rtol, rtol, rtol)); + PetscCall(TestState(STATEVAR_ENTROPY, STATEVAR_CONSERVATIVE, gas, V0, rtol, rtol, rtol)); + PetscCall(TestState(STATEVAR_ENTROPY, STATEVAR_PRIMITIVE, gas, V0, rtol, rtol, rtol)); + } + + { + CeedScalar rtol = 5e-6; + + 
PetscCall(TestState_fwd(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol)); + PetscCall(TestState_fwd(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol)); + PetscCall(TestState_fwd(STATEVAR_CONSERVATIVE, STATEVAR_PRIMITIVE, gas, U0, rtol, rtol, rtol)); + PetscCall(TestState_fwd(STATEVAR_CONSERVATIVE, STATEVAR_ENTROPY, gas, U0, 10 * rtol, rtol, rtol)); + PetscCall(TestState_fwd(STATEVAR_ENTROPY, STATEVAR_CONSERVATIVE, gas, V0, 5 * rtol, rtol, rtol)); + PetscCall(TestState_fwd(STATEVAR_ENTROPY, STATEVAR_PRIMITIVE, gas, V0, 5 * rtol, 5 * rtol, 5 * rtol)); } PetscFunctionReturn(PETSC_SUCCESS); } @@ -81,14 +171,12 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator CeedOperatorField field; PetscInt sub_op_index = 0; // will be 0 for the volume op - PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops)); + PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q)); - PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i)); - PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data)); PetscCallCeed(ceed, CeedOperatorGetContext(sub_ops[sub_op_index], &qf_ctx)); } @@ -113,9 +201,15 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE)); PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", 
elem_restr_q, basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedVectorDestroy(&q_data)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_q)); + PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass)); PetscFunctionReturn(PETSC_SUCCESS); } + PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { SetupContext setup_context; User user = *(User *)ctx; @@ -136,17 +230,11 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b // ------------------------------------------------------ // Setup Generic Newtonian IG Problem // ------------------------------------------------------ - problem->dim = 3; - problem->q_data_size_vol = 10; - problem->q_data_size_sur = 10; - problem->jac_data_size_sur = 11; - problem->setup_vol.qfunction = Setup; - problem->setup_vol.qfunction_loc = Setup_loc; - problem->setup_sur.qfunction = SetupBoundary; - problem->setup_sur.qfunction_loc = SetupBoundary_loc; - problem->non_zero_time = PETSC_FALSE; - problem->print_info = PRINT_NEWTONIAN; - problem->uses_newtonian = PETSC_TRUE; + problem->dim = 3; + problem->jac_data_size_sur = 11; + problem->compute_exact_solution_error = PETSC_FALSE; + problem->print_info = PRINT_NEWTONIAN; + problem->uses_newtonian = PETSC_TRUE; // ------------------------------------------------------ // Create the libCEED context @@ -169,7 +257,7 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; StatePrimitive reference = {.pressure = 1.01e5, .velocity = {0}, .temperature = 288.15}; - CeedScalar idl_decay_time = -1, idl_start = 0, idl_length = 0; + CeedScalar idl_decay_time = -1, idl_start = 0, idl_length = 0, idl_pressure = reference.pressure; PetscBool 
idl_enable = PETSC_FALSE; // ------------------------------------------------------ @@ -205,7 +293,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Conserv; problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Conserv_loc; break; - case STATEVAR_PRIMITIVE: problem->ics.qfunction = ICsNewtonianIG_Prim; problem->ics.qfunction_loc = ICsNewtonianIG_Prim_loc; @@ -218,6 +305,18 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Prim; problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Prim_loc; break; + case STATEVAR_ENTROPY: + problem->ics.qfunction = ICsNewtonianIG_Entropy; + problem->ics.qfunction_loc = ICsNewtonianIG_Entropy_loc; + problem->apply_vol_ifunction.qfunction = IFunction_Newtonian_Entropy; + problem->apply_vol_ifunction.qfunction_loc = IFunction_Newtonian_Entropy_loc; + problem->apply_vol_ijacobian.qfunction = IJacobian_Newtonian_Entropy; + problem->apply_vol_ijacobian.qfunction_loc = IJacobian_Newtonian_Entropy_loc; + problem->apply_inflow.qfunction = BoundaryIntegral_Entropy; + problem->apply_inflow.qfunction_loc = BoundaryIntegral_Entropy_loc; + problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Entropy; + problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Entropy_loc; + break; } // -- Physics @@ -229,8 +328,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b PetscInt dim = problem->dim; PetscCall(PetscOptionsDeprecated("-g", "-gravity", "libCEED 0.11.1", NULL)); - PetscCall(PetscOptionsRealArray("-g", "Gravitational acceleration vector", NULL, g, &dim, &given_option)); - dim = problem->dim; PetscCall(PetscOptionsRealArray("-gravity", "Gravitational acceleration vector", NULL, g, &dim, &given_option)); if (given_option) PetscCheck(dim 
== 3, comm, PETSC_ERR_ARG_SIZ, "Gravity vector must be size 3, %" PetscInt_FMT " values given", dim); @@ -269,6 +366,9 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b if (idl_decay_time < 0) idl_enable = PETSC_FALSE; PetscCall(PetscOptionsScalar("-idl_start", "Start of IDL in the x direction", NULL, idl_start, &idl_start, NULL)); PetscCall(PetscOptionsScalar("-idl_length", "Length of IDL in the positive x direction", NULL, idl_length, &idl_length, NULL)); + idl_pressure = reference.pressure; + PetscCall(PetscOptionsScalar("-idl_pressure", "Pressure IDL uses as reference (default is `-reference_pressure`)", NULL, idl_pressure, + &idl_pressure, NULL)); PetscOptionsEnd(); if (stab == STAB_SUPG && !implicit) problem->create_mass_operator = CreateKSPMassOperator_NewtonianStabilized; @@ -322,15 +422,14 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b newtonian_ig_ctx->Ctau_C = Ctau_C; newtonian_ig_ctx->Ctau_M = Ctau_M; newtonian_ig_ctx->Ctau_E = Ctau_E; - newtonian_ig_ctx->P0 = reference.pressure; newtonian_ig_ctx->stabilization = stab; - newtonian_ig_ctx->P0 = reference.pressure; newtonian_ig_ctx->is_implicit = implicit; newtonian_ig_ctx->state_var = state_var; newtonian_ig_ctx->idl_enable = idl_enable; newtonian_ig_ctx->idl_amplitude = 1 / (idl_decay_time * second); newtonian_ig_ctx->idl_start = idl_start * meter; newtonian_ig_ctx->idl_length = idl_length * meter; + newtonian_ig_ctx->idl_pressure = idl_pressure; PetscCall(PetscArraycpy(newtonian_ig_ctx->g, g, 3)); // -- Setup Context diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c deleted file mode 100644 index 3f5f3cddb6..0000000000 --- a/examples/fluids/problems/sgs_dd_model.c +++ /dev/null @@ -1,594 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../qfunctions/sgs_dd_model.h" - -#include - -#include "../navierstokes.h" - -typedef struct { - CeedElemRestriction elem_restr_grid_aniso, elem_restr_sgs; - CeedVector grid_aniso_ceed; - CeedQFunctionContext sgsdd_qfctx, ifunction_qfctx; -} *SgsDDSetupData; - -PetscErrorCode SgsDDSetupDataDestroy(SgsDDSetupData sgs_dd_setup_data) { - Ceed ceed; - - PetscFunctionBeginUser; - PetscCall(CeedElemRestrictionGetCeed(sgs_dd_setup_data->elem_restr_sgs, &ceed)); - - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_setup_data->elem_restr_grid_aniso)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_setup_data->elem_restr_sgs)); - PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_setup_data->grid_aniso_ceed)); - PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_setup_data->sgsdd_qfctx)); - PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_setup_data->ifunction_qfctx)); - PetscCall(PetscFree(sgs_dd_setup_data)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Create DM for storing subgrid stress at nodes -static PetscErrorCode SgsDDCreateDM(DM dm_source, DM *dm_sgs, PetscInt degree, PetscInt q_extra, PetscInt *num_components) { - PetscSection section; - - PetscFunctionBeginUser; - *num_components = 6; - - PetscCall(DMClone(dm_source, dm_sgs)); - PetscCall(PetscObjectSetName((PetscObject)*dm_sgs, "Subgrid Stress Projection")); - - PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, num_components, *dm_sgs)); - - PetscCall(DMGetLocalSection(*dm_sgs, §ion)); - PetscCall(PetscSectionSetFieldName(section, 0, "")); - PetscCall(PetscSectionSetComponentName(section, 0, 0, "KMSubgridStressXX")); - PetscCall(PetscSectionSetComponentName(section, 0, 1, "KMSubgridStressYY")); - PetscCall(PetscSectionSetComponentName(section, 0, 2, "KMSubgridStressZZ")); - PetscCall(PetscSectionSetComponentName(section, 0, 3, 
"KMSubgridStressYZ")); - PetscCall(PetscSectionSetComponentName(section, 0, 4, "KMSubgridStressXZ")); - PetscCall(PetscSectionSetComponentName(section, 0, 5, "KMSubgridStressXY")); - PetscFunctionReturn(PETSC_SUCCESS); -}; - -// @brief Evaluate data-driven SGS using fused method -static PetscErrorCode SgsDDNodalStressEval_Fused(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) { - SgsDDData sgs_dd_data = user->sgs_dd_data; - PetscMemType q_mem_type; - - PetscFunctionBeginUser; - PetscCall(VecPetscToCeed(Q_loc, &q_mem_type, user->q_ceed)); // q_ceed is an implicit input - - PetscCall(ApplyCeedOperatorGlobalToLocal(VelocityGradient, SGSNodal_loc, sgs_dd_data->op_nodal_evaluation_ctx)); - - PetscCall(VecCeedToPetsc(user->q_ceed, q_mem_type, Q_loc)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Create CeedOperator to calculate data-drive SGS at nodes using fused operator -static PetscErrorCode SgsDDSetupNodalEvaluation_Fused(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) { - SgsDDData sgs_dd_data = user->sgs_dd_data; - CeedQFunction qf_sgs_dd_nodal; - CeedOperator op_sgs_dd_nodal; - CeedInt num_comp_q, num_comp_grad_velo, num_comp_x, num_comp_grid_aniso; - PetscInt dim; - CeedVector inv_multiplicity; - CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs; - DMLabel domain_label = NULL; - PetscInt label_value = 0, height = 0, dm_field = 0; - - PetscFunctionBeginUser; - PetscCall(DMGetDimension(user->dm, &dim)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso)); - - { // Get velocity gradient information - CeedOperatorField op_field; - PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->grad_velo_proj->l2_rhs_ctx->op, 
"velocity gradient", &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo)); - } - PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, &elem_restr_sgs)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_sgs, &sgs_dd_data->sgs_nodal_ceed, NULL)); - - PetscCall(GetInverseMultiplicity(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, PETSC_FALSE, &elem_restr_inv_multiplicity, - &inv_multiplicity)); - - // -- Create operator for SGS DD model nodal evaluation - switch (user->phys->state_var) { - case STATEVAR_PRIMITIVE: - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Prim, ComputeSgsDDNodal_Prim_loc, &qf_sgs_dd_nodal)); - break; - case STATEVAR_CONSERVATIVE: - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Conserv, ComputeSgsDDNodal_Conserv_loc, &qf_sgs_dd_nodal)); - break; - default: - SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Data-driven SGS nodal evaluation not available for chosen state variable"); - } - - // Mesh/geometry order and solution basis order may differ, therefore must interpolate - CeedBasis basis_x_to_q; - PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &basis_x_to_q)); - - PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_nodal, sgs_dd_setup_data->sgsdd_qfctx)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "q", num_comp_q, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "x", num_comp_x, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "anisotropy tensor", num_comp_grid_aniso, 
CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "inverse multiplicity", 1, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_nodal, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_NONE)); - - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_nodal, NULL, NULL, &op_sgs_dd_nodal)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "q", ceed_data->elem_restr_q, CEED_BASIS_NONE, user->q_ceed)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "x", ceed_data->elem_restr_x, basis_x_to_q, ceed_data->x_coord)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE, - sgs_dd_setup_data->grid_aniso_ceed)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "km_sgs", elem_restr_sgs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - PetscCall(OperatorApplyContextCreate(user->grad_velo_proj->dm, sgs_dd_data->dm_sgs, ceed, op_sgs_dd_nodal, NULL, sgs_dd_data->sgs_nodal_ceed, NULL, - NULL, &sgs_dd_data->op_nodal_evaluation_ctx)); - - sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs; - sgs_dd_data->sgs_nodal_eval = SgsDDNodalStressEval_Fused; - - PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity)); - PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_to_q)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_nodal)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_nodal)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Setup data-driven model inference using internal (libCEED native) implementation -static PetscErrorCode 
SgsDDSetupNodalEvaluation_Sequential_Internal(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data, - CeedElemRestriction elem_restr_dd_inputs, - CeedElemRestriction elem_restr_dd_outputs, - CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity, - void **ctx) { - CeedQFunction qf_sgs_dd_inference; - CeedOperator op_sgs_dd_inference; - OperatorApplyContext *op_context = (OperatorApplyContext *)ctx; - - PetscFunctionBeginUser; - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inference, ComputeSgsDDNodal_Sequential_Inference_loc, - &qf_sgs_dd_inference)); - - PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inference, sgs_dd_setup_data->sgsdd_qfctx)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inference, "model inputs", sgs_dd_data->num_comp_inputs, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inference, "inverse multiplicity", 1, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inference, "model outputs", sgs_dd_data->num_comp_outputs, CEED_EVAL_NONE)); - - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_inference, NULL, NULL, &op_sgs_dd_inference)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inference, "model inputs", elem_restr_dd_inputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, - CeedOperatorSetField(op_sgs_dd_inference, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inference, "model outputs", elem_restr_dd_outputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - PetscCall(OperatorApplyContextCreate(sgs_dd_data->dm_dd_inputs, sgs_dd_data->dm_dd_outputs, ceed, op_sgs_dd_inference, NULL, NULL, NULL, NULL, - op_context)); - sgs_dd_data->sgs_nodal_inference_ctx_destroy = (PetscErrorCode(*)(void *))OperatorApplyContextDestroy; - - PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_inference)); - 
PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_inference)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Perform data-driven model inference using internal (libCEED native) implementation -PetscErrorCode SgsDDNodalStressEval_Sequential_Internal(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) { - OperatorApplyContext op_context = *(OperatorApplyContext *)ctx; - - PetscFunctionBeginUser; - PetscCall(ApplyCeedOperatorLocalToLocal(DD_Inputs_loc, DD_Outputs_loc, op_context)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Evaluate data-driven SGS using sequential method -PetscErrorCode SgsDDNodalStressEval_Sequential(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) { - SgsDDData sgs_dd_data = user->sgs_dd_data; - PetscMemType q_mem_type; - Vec DD_Inputs_loc, DD_Outputs_loc; - - PetscFunctionBeginUser; - PetscCall(DMGetLocalVector(sgs_dd_data->dm_dd_inputs, &DD_Inputs_loc)); - PetscCall(DMGetLocalVector(sgs_dd_data->dm_dd_outputs, &DD_Outputs_loc)); - PetscCall(VecPetscToCeed(Q_loc, &q_mem_type, user->q_ceed)); // q_ceed is an implicit input - - PetscCall(ApplyCeedOperatorGlobalToLocal(VelocityGradient, DD_Inputs_loc, sgs_dd_data->op_nodal_dd_inputs_ctx)); - PetscCall(sgs_dd_data->sgs_nodal_inference(DD_Inputs_loc, DD_Outputs_loc, &sgs_dd_data->sgs_nodal_inference_ctx)); - PetscCall(ApplyCeedOperatorLocalToLocal(DD_Outputs_loc, SGSNodal_loc, sgs_dd_data->op_nodal_dd_outputs_ctx)); - - PetscCall(VecCeedToPetsc(user->q_ceed, q_mem_type, Q_loc)); - PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_dd_inputs, &DD_Inputs_loc)); - PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_dd_outputs, &DD_Outputs_loc)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Create CeedOperator to calculate data-drive SGS at nodes using sequentially-applied operators -static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) { - SgsDDData sgs_dd_data = user->sgs_dd_data; - CeedInt 
num_comp_q, num_comp_grad_velo, num_comp_x, num_comp_grid_aniso, num_comp_eigvec = 9 + 1; - PetscInt dim; - CeedVector inv_multiplicity, eigvec; - CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs, elem_restr_eigvec, elem_restr_dd_inputs, - elem_restr_dd_outputs; - DMLabel domain_label = NULL; - PetscInt label_value = 0, height = 0, dm_field = 0; - - PetscFunctionBeginUser; - { // Create DMs for data-driven input and output values - PetscSection section; - PetscInt degree, q_extra; - { // Get degree and number of quadrature points from dm_sgs - PetscFE fe; - PetscSpace basis; - PetscQuadrature quadrature; - PetscInt num_qpnts; - PetscCall(DMGetField(sgs_dd_data->dm_sgs, 0, NULL, (PetscObject *)&fe)); - PetscCall(PetscFEGetBasisSpace(fe, &basis)); - PetscCall(PetscSpaceGetDegree(basis, °ree, NULL)); - PetscCall(PetscFEGetQuadrature(fe, &quadrature)); - PetscCall(PetscQuadratureGetOrder(quadrature, &num_qpnts)); - q_extra = degree - num_qpnts; - } - - PetscCall(DMClone(sgs_dd_data->dm_sgs, &sgs_dd_data->dm_dd_inputs)); - PetscCall(PetscObjectSetName((PetscObject)sgs_dd_data->dm_dd_inputs, "Data-Driven Model Inputs")); - PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, &sgs_dd_data->num_comp_inputs, sgs_dd_data->dm_dd_inputs)); - PetscCall(DMGetLocalSection(sgs_dd_data->dm_dd_inputs, §ion)); - PetscCall(PetscSectionSetFieldName(section, 0, "")); - for (CeedInt i = 0; i < sgs_dd_data->num_comp_inputs; i++) { - char component_name[PETSC_MAX_PATH_LEN]; - - PetscCall(PetscSNPrintf(component_name, sizeof component_name, "DataDrivenInput%" CeedInt_FMT, i + 1)); - PetscCall(PetscSectionSetComponentName(section, 0, i, component_name)); - } - - PetscCall(DMClone(sgs_dd_data->dm_sgs, &sgs_dd_data->dm_dd_outputs)); - PetscCall(PetscObjectSetName((PetscObject)sgs_dd_data->dm_dd_outputs, "Data-Driven Model Outputs")); - PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, 
&sgs_dd_data->num_comp_outputs, sgs_dd_data->dm_dd_outputs)); - PetscCall(DMGetLocalSection(sgs_dd_data->dm_dd_outputs, §ion)); - PetscCall(PetscSectionSetFieldName(section, 0, "")); - for (CeedInt i = 0; i < sgs_dd_data->num_comp_outputs; i++) { - char component_name[PETSC_MAX_PATH_LEN]; - - PetscCall(PetscSNPrintf(component_name, sizeof component_name, "DataDrivenOutput%" CeedInt_FMT, i + 1)); - PetscCall(PetscSectionSetComponentName(section, 0, i, component_name)); - } - } - - PetscCall(DMGetDimension(user->dm, &dim)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso)); - - { // Get velocity gradient information - CeedOperatorField op_field; - PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_grad_velo, &sgs_dd_data->grad_velo_ceed, NULL)); - } - - PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, &elem_restr_sgs)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_sgs, &sgs_dd_data->sgs_nodal_ceed, NULL)); - PetscCall( - DMPlexCeedElemRestrictionCollocatedCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, num_comp_eigvec, &elem_restr_eigvec)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_eigvec, &eigvec, NULL)); - - PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_dd_inputs, domain_label, label_value, height, dm_field, &elem_restr_dd_inputs)); - 
PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_dd_outputs, domain_label, label_value, height, dm_field, &elem_restr_dd_outputs)); - - PetscCall(GetInverseMultiplicity(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, PETSC_FALSE, &elem_restr_inv_multiplicity, - &inv_multiplicity)); - - { // Create operator for data-driven input evaluation - CeedQFunction qf_sgs_dd_inputs; - CeedOperator op_sgs_dd_inputs; - - switch (user->phys->state_var) { - case STATEVAR_PRIMITIVE: - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Prim, - ComputeSgsDDNodal_Sequential_Inputs_Prim_loc, &qf_sgs_dd_inputs)); - break; - case STATEVAR_CONSERVATIVE: - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Conserv, - ComputeSgsDDNodal_Sequential_Inputs_Conserv_loc, &qf_sgs_dd_inputs)); - break; - default: - SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, - "Data-driven SGS nodal input evaluation not available for chosen state variable"); - } - - PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inputs, sgs_dd_setup_data->sgsdd_qfctx)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "q", num_comp_q, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "inverse multiplicity", 1, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inputs, "eigenvectors", num_comp_eigvec, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inputs, "model inputs", sgs_dd_data->num_comp_inputs, CEED_EVAL_NONE)); - - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_inputs, NULL, NULL, &op_sgs_dd_inputs)); - PetscCallCeed(ceed, 
CeedOperatorSetField(op_sgs_dd_inputs, "q", ceed_data->elem_restr_q, CEED_BASIS_NONE, user->q_ceed)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE, - sgs_dd_setup_data->grid_aniso_ceed)); - PetscCallCeed(ceed, - CeedOperatorSetField(op_sgs_dd_inputs, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "eigenvectors", elem_restr_eigvec, CEED_BASIS_NONE, eigvec)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "model inputs", elem_restr_dd_inputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - PetscCall(OperatorApplyContextCreate(user->grad_velo_proj->dm, sgs_dd_data->dm_dd_inputs, ceed, op_sgs_dd_inputs, NULL, NULL, NULL, NULL, - &sgs_dd_data->op_nodal_dd_inputs_ctx)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_inputs)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_inputs)); - } - - { // Create operator for data-driven output handling - CeedQFunction qf_sgs_dd_outputs; - CeedOperator op_sgs_dd_outputs; - - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Outputs, ComputeSgsDDNodal_Sequential_Outputs_loc, - &qf_sgs_dd_outputs)); - PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_outputs, sgs_dd_setup_data->sgsdd_qfctx)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "model outputs", sgs_dd_data->num_comp_outputs, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "inverse multiplicity", 1, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "eigenvectors", 
num_comp_eigvec, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_outputs, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_NONE)); - - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_outputs, NULL, NULL, &op_sgs_dd_outputs)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "model outputs", elem_restr_dd_outputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE, - sgs_dd_setup_data->grid_aniso_ceed)); - PetscCallCeed(ceed, - CeedOperatorSetField(op_sgs_dd_outputs, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "eigenvectors", elem_restr_eigvec, CEED_BASIS_NONE, eigvec)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "km_sgs", elem_restr_sgs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - PetscCall(OperatorApplyContextCreate(sgs_dd_data->dm_dd_outputs, sgs_dd_data->dm_sgs, ceed, op_sgs_dd_outputs, NULL, sgs_dd_data->sgs_nodal_ceed, - NULL, NULL, &sgs_dd_data->op_nodal_dd_outputs_ctx)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_outputs)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_outputs)); - } - - sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Internal; - sgs_dd_data->sgs_nodal_eval = SgsDDNodalStressEval_Sequential; - PetscCall(SgsDDSetupNodalEvaluation_Sequential_Internal(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs, - elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx)); - - sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs; - - PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity)); - PetscCallCeed(ceed, CeedVectorDestroy(&eigvec)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity)); - PetscCallCeed(ceed, 
CeedElemRestrictionDestroy(&elem_restr_eigvec)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dd_inputs)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dd_outputs)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Create CeedOperator to compute SGS contribution to the residual -static PetscErrorCode SgsSetupNodalIFunction(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) { - SgsDDData sgs_dd_data = user->sgs_dd_data; - CeedInt num_comp_q, num_comp_qd, num_comp_x; - PetscInt dim; - CeedQFunction qf_sgs_apply; - CeedOperator op_sgs_apply; - CeedBasis basis_sgs; - - PetscFunctionBeginUser; - PetscCall(DMGetDimension(user->dm, &dim)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_qd_i, &num_comp_qd)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x)); - - PetscCall(CreateBasisFromPlex(ceed, sgs_dd_data->dm_sgs, 0, 0, 0, 0, &basis_sgs)); - - switch (user->phys->state_var) { - case STATEVAR_PRIMITIVE: - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Prim, IFunction_NodalSgs_Prim_loc, &qf_sgs_apply)); - break; - case STATEVAR_CONSERVATIVE: - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Conserv, IFunction_NodalSgs_Conserv_loc, &qf_sgs_apply)); - break; - default: - SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Nodal SGS evaluation not available for chosen state variable"); - } - - PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_apply, sgs_dd_setup_data->ifunction_qfctx)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "q", num_comp_q, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "qdata", num_comp_qd, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "km_sgs", sgs_dd_data->num_comp_sgs, 
CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_apply, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD)); - - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_apply, NULL, NULL, &op_sgs_apply)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "km_sgs", sgs_dd_setup_data->elem_restr_sgs, basis_sgs, sgs_dd_data->sgs_nodal_ceed)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - - PetscCall( - OperatorApplyContextCreate(user->dm, user->dm, ceed, op_sgs_apply, user->q_ceed, user->g_ceed, NULL, NULL, &sgs_dd_data->op_sgs_apply_ctx)); - - PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_apply)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_apply)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Calculate and add data-driven SGS residual to the global residual -PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc) { - SgsDDData sgs_dd_data = user->sgs_dd_data; - Vec VelocityGradient, SGSNodal_loc; - PetscMemType sgs_nodal_mem_type; - - PetscFunctionBeginUser; - PetscCall(DMGetGlobalVector(user->grad_velo_proj->dm, &VelocityGradient)); - PetscCall(VelocityGradientProjectionApply(user->grad_velo_proj, Q_loc, VelocityGradient)); - - // -- Compute Nodal SGS tensor - PetscCall(DMGetLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc)); - PetscCall(sgs_dd_data->sgs_nodal_eval(user, Q_loc, VelocityGradient, SGSNodal_loc)); - - // -- Compute contribution of the SGS stress - PetscCall(VecPetscToCeed(SGSNodal_loc, &sgs_nodal_mem_type, sgs_dd_data->sgs_nodal_ceed)); // sgs_nodal_ceed is an implicit input - PetscCall(ApplyAddCeedOperatorLocalToLocal(Q_loc, G_loc, 
sgs_dd_data->op_sgs_apply_ctx)); - - // -- Return local SGS vector - PetscCall(VecCeedToPetsc(sgs_dd_data->sgs_nodal_ceed, sgs_nodal_mem_type, SGSNodal_loc)); - PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc)); - PetscCall(DMRestoreGlobalVector(user->grad_velo_proj->dm, &VelocityGradient)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief B = A^T, A is NxM, B is MxN -static PetscErrorCode TransposeMatrix(const PetscScalar *A, PetscScalar *B, const PetscInt N, const PetscInt M) { - PetscFunctionBeginUser; - for (PetscInt i = 0; i < N; i++) { - for (PetscInt j = 0; j < M; j++) { - B[j * N + i] = A[i * M + j]; - } - } - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Read neural network coefficients from file and put into context struct -static PetscErrorCode SgsDDContextFill(MPI_Comm comm, char data_dir[PETSC_MAX_PATH_LEN], SgsDDContext *psgsdd_ctx) { - SgsDDContext sgsdd_ctx; - PetscInt num_inputs = (*psgsdd_ctx)->num_inputs, num_outputs = (*psgsdd_ctx)->num_outputs, num_neurons = (*psgsdd_ctx)->num_neurons; - char file_path[PETSC_MAX_PATH_LEN]; - PetscScalar *temp; - - PetscFunctionBeginUser; - { - SgsDDContext sgsdd_temp; - PetscCall(PetscNew(&sgsdd_temp)); - *sgsdd_temp = **psgsdd_ctx; - sgsdd_temp->offsets.bias1 = 0; - sgsdd_temp->offsets.bias2 = sgsdd_temp->offsets.bias1 + num_neurons; - sgsdd_temp->offsets.weight1 = sgsdd_temp->offsets.bias2 + num_neurons; - sgsdd_temp->offsets.weight2 = sgsdd_temp->offsets.weight1 + num_neurons * num_inputs; - sgsdd_temp->offsets.out_scaling = sgsdd_temp->offsets.weight2 + num_inputs * num_neurons; - PetscInt total_num_scalars = sgsdd_temp->offsets.out_scaling + 2 * num_outputs; - sgsdd_temp->total_bytes = sizeof(*sgsdd_ctx) + total_num_scalars * sizeof(sgsdd_ctx->data[0]); - PetscCall(PetscMalloc(sgsdd_temp->total_bytes, &sgsdd_ctx)); - *sgsdd_ctx = *sgsdd_temp; - PetscCall(PetscFree(sgsdd_temp)); - } - - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "b1.dat")); - 
PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.bias1])); - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "b2.dat")); - PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.bias2])); - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "OutScaling.dat")); - PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling])); - - { - PetscCall(PetscMalloc1(num_inputs * num_neurons, &temp)); - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "w1.dat")); - PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, temp)); - PetscCall(TransposeMatrix(temp, &sgsdd_ctx->data[sgsdd_ctx->offsets.weight1], num_inputs, num_neurons)); - PetscCall(PetscFree(temp)); - } - { - PetscCall(PetscMalloc1(num_outputs * num_neurons, &temp)); - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "w2.dat")); - PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, temp)); - PetscCall(TransposeMatrix(temp, &sgsdd_ctx->data[sgsdd_ctx->offsets.weight2], num_neurons, num_outputs)); - PetscCall(PetscFree(temp)); - } - - PetscCall(PetscFree(*psgsdd_ctx)); - *psgsdd_ctx = sgsdd_ctx; - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) { - PetscReal alpha = 0; - SgsDDContext sgsdd_ctx; - MPI_Comm comm = user->comm; - char sgs_dd_dir[PETSC_MAX_PATH_LEN] = "./dd_sgs_parameters"; - SgsDDSetupData sgs_dd_setup_data; - PetscBool use_fused; - NewtonianIdealGasContext gas; - - PetscFunctionBeginUser; - PetscCall(VelocityGradientProjectionSetup(ceed, user, ceed_data, problem, user->phys->state_var, ceed_data->elem_restr_q, ceed_data->basis_q, - &user->grad_velo_proj)); - - PetscCall(PetscNew(&user->sgs_dd_data)); - user->sgs_dd_data->num_comp_inputs = 6; - user->sgs_dd_data->num_comp_outputs = 6; - - use_fused = 
PETSC_TRUE; - PetscOptionsBegin(comm, NULL, "SGS Data-Driven Model Options", NULL); - PetscCall(PetscOptionsReal("-sgs_model_dd_leakyrelu_alpha", "Slope parameter for Leaky ReLU activation function", NULL, alpha, &alpha, NULL)); - PetscCall(PetscOptionsString("-sgs_model_dd_parameter_dir", "Path to directory with model parameters (weights, biases, etc.)", NULL, sgs_dd_dir, - sgs_dd_dir, sizeof(sgs_dd_dir), NULL)); - PetscCall( - PetscOptionsBool("-sgs_model_dd_use_fused", "Use the fused SGS DD model evaluation instead of sequential", NULL, use_fused, &use_fused, NULL)); - PetscOptionsEnd(); - - PetscCall(PetscNew(&sgsdd_ctx)); - sgsdd_ctx->num_layers = 1; - sgsdd_ctx->num_inputs = 6; - sgsdd_ctx->num_outputs = 6; - sgsdd_ctx->num_neurons = 20; - sgsdd_ctx->alpha = alpha; - - PetscCall(SgsDDContextFill(comm, sgs_dd_dir, &sgsdd_ctx)); - - // -- Create DM for storing SGS tensor at nodes - PetscCall(SgsDDCreateDM(user->dm, &user->sgs_dd_data->dm_sgs, user->app_ctx->degree, user->app_ctx->q_extra, &user->sgs_dd_data->num_comp_sgs)); - - PetscCall(PetscNew(&sgs_dd_setup_data)); - - PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas)); - sgsdd_ctx->gas = *gas; - PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas)); - PetscCallCeed(ceed, CeedQFunctionContextCreate(user->ceed, &sgs_dd_setup_data->sgsdd_qfctx)); - PetscCallCeed(ceed, - CeedQFunctionContextSetData(sgs_dd_setup_data->sgsdd_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, sgsdd_ctx->total_bytes, sgsdd_ctx)); - PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(sgs_dd_setup_data->sgsdd_qfctx, CEED_MEM_HOST, FreeContextPetsc)); - - PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(problem->apply_vol_ifunction.qfunction_context, &sgs_dd_setup_data->ifunction_qfctx)); - - // -- Compute and store anisotropy tensor - PetscCall(GridAnisotropyTensorProjectionSetupApply(ceed, user, ceed_data, 
&sgs_dd_setup_data->elem_restr_grid_aniso, - &sgs_dd_setup_data->grid_aniso_ceed)); - - // -- Create Nodal Evaluation Operator - if (use_fused) PetscCall(SgsDDSetupNodalEvaluation_Fused(ceed, user, ceed_data, sgs_dd_setup_data)); - else PetscCall(SgsDDSetupNodalEvaluation_Sequential(ceed, user, ceed_data, sgs_dd_setup_data)); - - // -- Create Operator to evalutate residual of SGS stress - PetscCall(SgsSetupNodalIFunction(ceed, user, ceed_data, sgs_dd_setup_data)); - - PetscCall(SgsDDSetupDataDestroy(sgs_dd_setup_data)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SgsDDDataDestroy(SgsDDData sgs_dd_data) { - PetscFunctionBeginUser; - if (!sgs_dd_data) PetscFunctionReturn(PETSC_SUCCESS); - Ceed ceed = sgs_dd_data->op_sgs_apply_ctx->ceed; - - PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_data->sgs_nodal_ceed)); - PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_data->grad_velo_ceed)); - PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_evaluation_ctx)); - PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_sgs_apply_ctx)); - PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_dd_inputs_ctx)); - PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_dd_outputs_ctx)); - PetscCall(DMDestroy(&sgs_dd_data->dm_sgs)); - PetscCall(DMDestroy(&sgs_dd_data->dm_dd_inputs)); - PetscCall(DMDestroy(&sgs_dd_data->dm_dd_outputs)); - if (sgs_dd_data->sgs_nodal_inference_ctx) PetscCall(sgs_dd_data->sgs_nodal_inference_ctx_destroy(sgs_dd_data->sgs_nodal_inference_ctx)); - PetscCall(PetscFree(sgs_dd_data)); - PetscFunctionReturn(PETSC_SUCCESS); -} diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c index 36d2b991e9..462f9cebb8 100644 --- a/examples/fluids/problems/shocktube.c +++ b/examples/fluids/problems/shocktube.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -14,7 +14,6 @@ #include #include "../navierstokes.h" -#include "../qfunctions/setupgeo.h" PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc) { SetupContextShock setup_context; @@ -35,19 +34,13 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc) // SET UP SHOCKTUBE // ------------------------------------------------------ problem->dim = 3; - problem->q_data_size_vol = 10; - problem->q_data_size_sur = 4; - problem->setup_vol.qfunction = Setup; - problem->setup_vol.qfunction_loc = Setup_loc; - problem->setup_sur.qfunction = SetupBoundary; - problem->setup_sur.qfunction_loc = SetupBoundary_loc; problem->ics.qfunction = ICsShockTube; problem->ics.qfunction_loc = ICsShockTube_loc; problem->apply_vol_rhs.qfunction = EulerShockTube; problem->apply_vol_rhs.qfunction_loc = EulerShockTube_loc; problem->apply_vol_ifunction.qfunction = NULL; problem->apply_vol_ifunction.qfunction_loc = NULL; - problem->non_zero_time = PETSC_FALSE; + problem->compute_exact_solution_error = PETSC_FALSE; problem->print_info = PRINT_SHOCKTUBE; // ------------------------------------------------------ diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c index 5d9d4f1dc4..ca8dd2d10f 100644 --- a/examples/fluids/problems/stg_shur14.c +++ b/examples/fluids/problems/stg_shur14.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -99,6 +99,7 @@ static PetscErrorCode ReadStgInflow(const MPI_Comm comm, const char path[PETSC_M PetscCheck(wall_dist[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Distance to wall in %s cannot be negative", path); PetscCheck(lt[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Turbulent length scale in %s cannot be negative", path); PetscCheck(eps[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Turbulent dissipation in %s cannot be negative", path); + PetscCall(PetscStrToArrayDestroy(ndims, array)); } CeedScalar(*cij)[stg_ctx->nprofs] = (CeedScalar(*)[stg_ctx->nprofs]) & stg_ctx->data[stg_ctx->offsets.cij]; PetscCall(CalcCholeskyDecomp(comm, stg_ctx->nprofs, rij, cij)); @@ -144,6 +145,7 @@ static PetscErrorCode ReadStgRand(const MPI_Comm comm, const char path[PETSC_MAX sigma[0][i] = (CeedScalar)atof(array[4]); sigma[1][i] = (CeedScalar)atof(array[5]); sigma[2][i] = (CeedScalar)atof(array[6]); + PetscCall(PetscStrToArrayDestroy(ndims, array)); } PetscCall(PetscFClose(comm, fp)); PetscFunctionReturn(PETSC_SUCCESS); @@ -221,7 +223,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U char stg_inflow_path[PETSC_MAX_PATH_LEN] = "./STGInflow.dat"; char stg_rand_path[PETSC_MAX_PATH_LEN] = "./STGRand.dat"; PetscBool mean_only = PETSC_FALSE, use_stgstrong = PETSC_FALSE, use_fluctuating_IC = PETSC_FALSE, given_stg_dx = PETSC_FALSE; - CeedScalar u0 = 0.0, alpha = 1.01, stg_dx = 1.0e-3; + CeedScalar u0 = 0.0, alpha = 1.01, stg_dx = -1, stg_h_scale_factor = 1 / user->app_ctx->degree; CeedQFunctionContext stg_context; NewtonianIdealGasContext newtonian_ig_ctx; @@ -235,7 +237,11 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U PetscCall(PetscOptionsBool("-stg_strong", "Enforce STG inflow strongly", NULL, use_stgstrong, &use_stgstrong, NULL)); PetscCall(PetscOptionsBool("-stg_fluctuating_IC", "\"Extrude\" the fluctuations through the domain as an initial condition", NULL, 
use_fluctuating_IC, &use_fluctuating_IC, NULL)); - PetscCall(PetscOptionsReal("-stg_dx", "Element size in streamwise direction at inflow", NULL, stg_dx, &stg_dx, &given_stg_dx)); + PetscCall(PetscOptionsReal("-stg_dx", "Element length in x direction at inflow", NULL, stg_dx, &stg_dx, &given_stg_dx)); + PetscCall(PetscOptionsReal("-stg_h_scale_factor", "Scale element size for cutoff frequency calculation", NULL, stg_h_scale_factor, + &stg_h_scale_factor, NULL)); + PetscCall(PetscOptionsDeprecated("-stg_dyScale", NULL, "libCEED 0.12.0", "Use -stg_h_scale_factor to scale all the element dimensions")); + PetscCall(PetscOptionsDeprecated("-stg_dz", NULL, "libCEED 0.12.0", NULL)); PetscOptionsEnd(); PetscCall(PetscCalloc1(1, &global_stg_ctx)); @@ -247,6 +253,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U global_stg_ctx->use_fluctuating_IC = use_fluctuating_IC; global_stg_ctx->theta0 = theta0; global_stg_ctx->P0 = P0; + global_stg_ctx->h_scale_factor = stg_h_scale_factor; { // Calculate dx assuming constant spacing PetscReal domain_min[3], domain_max[3], domain_size[3]; @@ -256,6 +263,8 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U PetscInt nmax = 3, faces[3]; PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &nmax, NULL)); global_stg_ctx->dx = given_stg_dx ? 
stg_dx : domain_size[0] / faces[0]; + PetscCheck((global_stg_ctx->dx > 0) && PetscIsNormalReal((PetscReal)global_stg_ctx->dx), comm, PETSC_ERR_LIB, + "STG dx must be positive normal number, got %g", global_stg_ctx->dx); } PetscCallCeed(ceed, CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx)); @@ -278,7 +287,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U if (use_stgstrong) { // Use default boundary integral QF (BoundaryIntegral) in newtonian.h problem->use_strong_bc_ceed = PETSC_TRUE; - problem->bc_from_ics = PETSC_FALSE; + problem->set_bc_from_ics = PETSC_FALSE; } else { problem->apply_inflow.qfunction = StgShur14Inflow; problem->apply_inflow.qfunction_loc = StgShur14Inflow_loc; @@ -286,7 +295,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U problem->apply_inflow_jacobian.qfunction_loc = StgShur14Inflow_Jacobian_loc; PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow_jacobian.qfunction_context)); - problem->bc_from_ics = PETSC_TRUE; + problem->set_bc_from_ics = PETSC_TRUE; } PetscFunctionReturn(PETSC_SUCCESS); } @@ -307,6 +316,11 @@ PetscErrorCode SetupStrongStg(DM dm, SimpleBC bc, ProblemData problem, Physics p // {1,2,3,4} for u, v, w, T for (int i = 0; i < 4; i++) comps[i] = i + 1; break; + + case STATEVAR_ENTROPY: + // {1,2,3,4} + for (int i = 0; i < 4; i++) comps[i] = i + 1; + break; } PetscCall(DMGetLabel(dm, "Face Sets", &label)); diff --git a/examples/fluids/problems/stg_shur14.h b/examples/fluids/problems/stg_shur14.h index ea2087af28..49fd6f1f6b 100644 --- a/examples/fluids/problems/stg_shur14.h +++ b/examples/fluids/problems/stg_shur14.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/problems/taylorgreen.c b/examples/fluids/problems/taylorgreen.c index 9c090b593f..7a0e55f52c 100644 --- a/examples/fluids/problems/taylorgreen.c +++ b/examples/fluids/problems/taylorgreen.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/pytorch_pkgconfig.py b/examples/fluids/pytorch_pkgconfig.py new file mode 100644 index 0000000000..8cfdb96c32 --- /dev/null +++ b/examples/fluids/pytorch_pkgconfig.py @@ -0,0 +1,71 @@ +from pathlib import Path +import torch +import torch.utils.cpp_extension as C +import torch.utils as tutils +import re + +build_dir = Path('./build') +if not build_dir.is_dir(): + build_dir.mkdir() +pkgconfig_path = build_dir / 'libtorch.pc' + +variables = {} +keywords = {} + + +def add_variable(file, variable, value): + file.write(f"{variable}={value}\n") + + +def add_keyword(file, key, value): + file.write(f"{key}: {value}\n") + + +variables['prefix'] = Path(C.library_paths()[0]).parent.as_posix() + +keywords['Name'] = 'libTorch' +keywords['Description'] = 'Custom made PC for PyTorch' +keywords['Version'] = torch.__version__ + +keywords['Cflags'] = '' +for include_path in C.include_paths(): + keywords['Cflags'] += f'-I{include_path} ' + +# Need to search the CMake file to see whether the library was compiled with the CXX11 ABI standard +regex_ABI = re.compile(r'"(\S*GLIBCXX_USE_CXX11_ABI\S*)"') +torchCMakePath = Path(tutils.cmake_prefix_path) / 'Torch/TorchConfig.cmake' +abi_flag = '' 
+with torchCMakePath.open('r') as f: + for line in f: + regex_result = regex_ABI.search(line) + if regex_result: + abi_flag = regex_result[1] + +keywords['Cflags'] += abi_flag + +keywords['Libs'] = '' +for lib_path in C.library_paths(): + keywords['Libs'] += f'-L{lib_path} ' +keywords['Libs'] += '-lc10 -ltorch_cpu ' +if torch.cuda.is_available(): + keywords['Libs'] += '-lc10_cuda -ltorch_cuda ' + # Need to force linking with libtorch_cuda.so, so find path and specify linking flag to force it + # This flag might be of limited portability + for lib_path in C.library_paths(): + torch_cuda_path = Path(lib_path) / 'libtorch_cuda.so' + if torch_cuda_path.exists(): + variables['torch_cuda_path'] = torch_cuda_path.as_posix() + keywords['Libs'] += f'-Wl,--no-as-needed,"{torch_cuda_path.as_posix()}" ' +keywords['Libs'] += '-ltorch ' +keywords['Libs.private'] = '' + +with pkgconfig_path.open('w') as file: + for variable, value in variables.items(): + add_variable(file, variable, value) + + file.write('\n') + + for keyword, value in keywords.items(): + add_keyword(file, keyword, value) + +print(pkgconfig_path.absolute()) diff --git a/examples/fluids/qfunctions/advection.h b/examples/fluids/qfunctions/advection.h index 43b5293837..486e0727ed 100644 --- a/examples/fluids/qfunctions/advection.h +++ b/examples/fluids/qfunctions/advection.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,11 @@ /// @file /// Advection initial condition and operator for Navier-Stokes example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#include +#endif #include "advection_types.h" #include "newtonian_state.h" diff --git a/examples/fluids/qfunctions/advection_types.h b/examples/fluids/qfunctions/advection_types.h index 838995191c..ed008f0603 100644 --- a/examples/fluids/qfunctions/advection_types.h +++ b/examples/fluids/qfunctions/advection_types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -6,7 +6,11 @@ // This file is part of CEED: http://github.com/ceed #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif + #include "stabilization_types.h" typedef enum { diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h index 5fb4da2289..c348e9ab2e 100644 --- a/examples/fluids/qfunctions/bc_freestream.h +++ b/examples/fluids/qfunctions/bc_freestream.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,6 +7,10 @@ /// @file /// QFunctions for the `bc_freestream` and `bc_outflow` boundary conditions +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif + #include "bc_freestream_type.h" #include "newtonian_state.h" #include "newtonian_types.h" @@ -30,17 +34,17 @@ CEED_QFUNCTION_HELPER int Freestream(void *ctx, CeedInt Q, const CeedScalar *con const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; const State s = StateFromQ(newt_ctx, qi, state_var); - CeedScalar wdetJb, norm[3]; - QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, norm); + CeedScalar wdetJb, normal[3]; + QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, normal); wdetJb *= is_implicit ? -1. : 1.; StateConservative flux; switch (flux_type) { case RIEMANN_HLL: - flux = RiemannFlux_HLL(newt_ctx, s, context->S_infty, norm); + flux = RiemannFlux_HLL(newt_ctx, s, context->S_infty, normal); break; case RIEMANN_HLLC: - flux = RiemannFlux_HLLC(newt_ctx, s, context->S_infty, norm); + flux = RiemannFlux_HLLC(newt_ctx, s, context->S_infty, normal); break; } CeedScalar Flux[5]; @@ -64,6 +68,10 @@ CEED_QFUNCTION(Freestream_Prim_HLL)(void *ctx, CeedInt Q, const CeedScalar *cons return Freestream(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLL); } +CEED_QFUNCTION(Freestream_Entropy_HLL)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return Freestream(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLL); +} + CEED_QFUNCTION(Freestream_Conserv_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Freestream(ctx, Q, in, out, STATEVAR_CONSERVATIVE, RIEMANN_HLLC); } @@ -72,6 +80,10 @@ CEED_QFUNCTION(Freestream_Prim_HLLC)(void *ctx, CeedInt Q, const CeedScalar *con return Freestream(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLLC); } +CEED_QFUNCTION(Freestream_Entropy_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return Freestream(ctx, Q, in, out, 
STATEVAR_ENTROPY, RIEMANN_HLLC); +} + CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var, RiemannFluxType flux_type) { const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; @@ -86,8 +98,8 @@ CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedSc const State dS_infty = {0}; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar wdetJb, norm[3]; - QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, norm); + CeedScalar wdetJb, normal[3]; + QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, normal); wdetJb *= is_implicit ? -1. : 1.; CeedScalar qi[5], dqi[5]; @@ -99,10 +111,10 @@ CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedSc StateConservative dflux; switch (flux_type) { case RIEMANN_HLL: - dflux = RiemannFlux_HLL_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, norm); + dflux = RiemannFlux_HLL_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, normal); break; case RIEMANN_HLLC: - dflux = RiemannFlux_HLLC_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, norm); + dflux = RiemannFlux_HLLC_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, normal); break; } CeedScalar dFlux[5]; @@ -120,6 +132,10 @@ CEED_QFUNCTION(Freestream_Jacobian_Prim_HLL)(void *ctx, CeedInt Q, const CeedSca return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLL); } +CEED_QFUNCTION(Freestream_Jacobian_Entropy_HLL)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLL); +} + CEED_QFUNCTION(Freestream_Jacobian_Conserv_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_CONSERVATIVE, RIEMANN_HLLC); } @@ -128,6 +144,10 @@ CEED_QFUNCTION(Freestream_Jacobian_Prim_HLLC)(void *ctx, CeedInt Q, const CeedSc return 
Freestream_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLLC); } +CEED_QFUNCTION(Freestream_Jacobian_Entropy_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLLC); +} + // Note the identity // // softplus(x) - x = log(1 + exp(x)) - x @@ -166,8 +186,8 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar const bool is_implicit = gas->is_implicit; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar wdetJb, dXdx[2][3], norm[3]; - QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm); + CeedScalar wdetJb, dXdx[2][3], normal[3]; + QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal); wdetJb *= is_implicit ? -1. : 1.; const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; const State s_int = StateFromQ(gas, qi, state_var); @@ -175,10 +195,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar StatePrimitive y_ext = s_int.Y; y_ext.pressure = outflow->pressure; y_ext.temperature = outflow->temperature; - const CeedScalar u_normal = Dot3(y_ext.velocity, norm); + const CeedScalar u_normal = Dot3(y_ext.velocity, normal); const CeedScalar proj = (1 - outflow->recirc) * Softplus(-u_normal, outflow->softplus_velocity); for (CeedInt j = 0; j < 3; j++) { - y_ext.velocity[j] += norm[j] * proj; // (I - n n^T) projects into the plane tangent to the normal + y_ext.velocity[j] += normal[j] * proj; // (I - n n^T) projects into the plane tangent to the normal } State s_ext = StateFromPrimitive(gas, y_ext); @@ -191,10 +211,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar KMUnpack(kmstress, stress); ViscousEnergyFlux(gas, s_int.Y, grad_s, stress, Fe); - StateConservative F_inviscid_normal = RiemannFlux_HLLC(gas, s_int, s_ext, norm); + StateConservative F_inviscid_normal = RiemannFlux_HLLC(gas, s_int, s_ext, normal); CeedScalar Flux[5]; 
- FluxTotal_RiemannBoundary(F_inviscid_normal, stress, Fe, norm, Flux); + FluxTotal_RiemannBoundary(F_inviscid_normal, stress, Fe, normal, Flux); for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j]; @@ -215,6 +235,10 @@ CEED_QFUNCTION(RiemannOutflow_Prim)(void *ctx, CeedInt Q, const CeedScalar *cons return RiemannOutflow(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(RiemannOutflow_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return RiemannOutflow(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // ***************************************************************************** // Jacobian for Riemann pressure/temperature outflow boundary condition // ***************************************************************************** @@ -231,8 +255,8 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce const bool is_implicit = gas->is_implicit; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar wdetJb, dXdx[2][3], norm[3]; - QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm); + CeedScalar wdetJb, dXdx[2][3], normal[3]; + QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal); wdetJb *= is_implicit ? -1. 
: 1.; CeedScalar qi[5], kmstress[6], dqi[5]; @@ -247,13 +271,13 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce y_ext.temperature = outflow->temperature; dy_ext.pressure = 0; dy_ext.temperature = 0; - const CeedScalar u_normal = Dot3(s_int.Y.velocity, norm); - const CeedScalar du_normal = Dot3(ds_int.Y.velocity, norm); + const CeedScalar u_normal = Dot3(s_int.Y.velocity, normal); + const CeedScalar du_normal = Dot3(ds_int.Y.velocity, normal); const CeedScalar proj = (1 - outflow->recirc) * Softplus(-u_normal, outflow->softplus_velocity); const CeedScalar dproj = (1 - outflow->recirc) * Softplus_fwd(-u_normal, -du_normal, outflow->softplus_velocity); for (CeedInt j = 0; j < 3; j++) { - y_ext.velocity[j] += norm[j] * proj; - dy_ext.velocity[j] += norm[j] * dproj; + y_ext.velocity[j] += normal[j] * proj; + dy_ext.velocity[j] += normal[j] * dproj; } State s_ext = StateFromPrimitive(gas, y_ext); @@ -269,10 +293,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce KMUnpack(kmstress, stress); ViscousEnergyFlux_fwd(gas, s_int.Y, ds_int.Y, grad_ds, stress, dstress, dFe); - StateConservative dF_inviscid_normal = RiemannFlux_HLLC_fwd(gas, s_int, ds_int, s_ext, ds_ext, norm); + StateConservative dF_inviscid_normal = RiemannFlux_HLLC_fwd(gas, s_int, ds_int, s_ext, ds_ext, normal); CeedScalar dFlux[5]; - FluxTotal_RiemannBoundary(dF_inviscid_normal, dstress, dFe, norm, dFlux); + FluxTotal_RiemannBoundary(dF_inviscid_normal, dstress, dFe, normal, dFlux); for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j]; } @@ -287,6 +311,10 @@ CEED_QFUNCTION(RiemannOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedSca return RiemannOutflow_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(RiemannOutflow_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return RiemannOutflow_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // 
***************************************************************************** // Outflow boundary condition, weakly setting a constant pressure. This is the // classic outflow condition used by PHASTA-C and retained largely for @@ -310,8 +338,8 @@ CEED_QFUNCTION_HELPER int PressureOutflow(void *ctx, CeedInt Q, const CeedScalar State s = StateFromQ(gas, qi, state_var); s.Y.pressure = outflow->pressure; - CeedScalar wdetJb, dXdx[2][3], norm[3]; - QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm); + CeedScalar wdetJb, dXdx[2][3], normal[3]; + QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal); wdetJb *= is_implicit ? -1. : 1.; State grad_s[3]; @@ -327,7 +355,7 @@ CEED_QFUNCTION_HELPER int PressureOutflow(void *ctx, CeedInt Q, const CeedScalar FluxInviscid(gas, s, F_inviscid); CeedScalar Flux[5]; - FluxTotal_Boundary(F_inviscid, stress, Fe, norm, Flux); + FluxTotal_Boundary(F_inviscid, stress, Fe, normal, Flux); for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j]; @@ -348,6 +376,10 @@ CEED_QFUNCTION(PressureOutflow_Prim)(void *ctx, CeedInt Q, const CeedScalar *con return PressureOutflow(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(PressureOutflow_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return PressureOutflow(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // ***************************************************************************** // Jacobian for weak-pressure outflow boundary condition // ***************************************************************************** @@ -364,8 +396,8 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, const C const bool is_implicit = gas->is_implicit; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar wdetJb, dXdx[2][3], norm[3]; - QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm); + CeedScalar wdetJb, dXdx[2][3], normal[3]; + QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal); wdetJb *= 
is_implicit ? -1. : 1.; CeedScalar qi[5], kmstress[6], dqi[5]; @@ -392,7 +424,7 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, const C FluxInviscid_fwd(gas, s, ds, dF_inviscid); CeedScalar dFlux[5]; - FluxTotal_Boundary(dF_inviscid, dstress, dFe, norm, dFlux); + FluxTotal_Boundary(dF_inviscid, dstress, dFe, normal, dFlux); for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j]; } @@ -406,3 +438,7 @@ CEED_QFUNCTION(PressureOutflow_Jacobian_Conserv)(void *ctx, CeedInt Q, const Cee CEED_QFUNCTION(PressureOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return PressureOutflow_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE); } + +CEED_QFUNCTION(PressureOutflow_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return PressureOutflow_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY); +} diff --git a/examples/fluids/qfunctions/bc_freestream_type.h b/examples/fluids/qfunctions/bc_freestream_type.h index 8c30ca2915..62a3fa1c4c 100644 --- a/examples/fluids/qfunctions/bc_freestream_type.h +++ b/examples/fluids/qfunctions/bc_freestream_type.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/qfunctions/bc_slip.h b/examples/fluids/qfunctions/bc_slip.h index 816d4957ca..5a77f3727e 100644 --- a/examples/fluids/qfunctions/bc_slip.h +++ b/examples/fluids/qfunctions/bc_slip.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -56,6 +56,10 @@ CEED_QFUNCTION(Slip_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, Cee return Slip(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(Slip_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return Slip(ctx, Q, in, out, STATEVAR_ENTROPY); +} + CEED_QFUNCTION_HELPER int Slip_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var) { const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; const CeedScalar(*q_data_sur) = in[2]; @@ -104,3 +108,7 @@ CEED_QFUNCTION(Slip_Jacobian_Conserv)(void *ctx, CeedInt Q, const CeedScalar *co CEED_QFUNCTION(Slip_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Slip_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE); } + +CEED_QFUNCTION(Slip_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return Slip_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY); +} diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h index 52a7ff5614..20e4f4c72b 100644 --- a/examples/fluids/qfunctions/blasius.h +++ b/examples/fluids/qfunctions/blasius.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,10 @@ /// @file /// Operator for Navier-Stokes example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif #include "newtonian_state.h" #include "newtonian_types.h" @@ -17,13 +20,11 @@ typedef struct BlasiusContext_ *BlasiusContext; struct BlasiusContext_ { - bool implicit; // !< Using implicit timesteping or not - bool weakT; // !< flag to set Temperature weakly at inflow - CeedScalar delta0; // !< Boundary layer height at inflow - CeedScalar U_inf; // !< Velocity at boundary layer edge - CeedScalar T_inf; // !< Temperature at boundary layer edge + bool implicit; // !< Using implicit timestepping or not + bool weakT; // !< flag to set Temperature weakly at inflow + CeedScalar delta0; // !< Boundary layer height at inflow + State S_infty; CeedScalar T_wall; // !< Temperature at the wall - CeedScalar P0; // !< Pressure at outflow CeedScalar x_inflow; // !< Location of inflow in x CeedScalar n_cheb; // !< Number of Chebyshev terms CeedScalar *X; // !< Chebyshev polynomial coordinate vector (CPU only) @@ -39,7 +40,7 @@ struct BlasiusContext_ { CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, double eta_max, double *f) { double dX_deta = 2 / eta_max; double table[4][3] = { - // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1) + // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1) {1, x, 2 * x * x - 1}, {0, 1, 4 * x }, {0, 0, 4 }, @@ -72,25 +73,26 @@ CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, doub // ***************************************************************************** State CEED_QFUNCTION_HELPER(BlasiusSolution)(const BlasiusContext blasius, const CeedScalar x[3], const CeedScalar x0, const CeedScalar x_inflow, const CeedScalar rho_infty, CeedScalar *t12) { - CeedInt N = blasius->n_cheb; - CeedScalar mu = blasius->newtonian_ctx.mu; - CeedScalar nu = mu / rho_infty; - CeedScalar eta = x[1] *
sqrt(blasius->U_inf / (nu * (x0 + x[0] - x_inflow))); - CeedScalar X = 2 * (eta / blasius->eta_max) - 1.; - CeedScalar U_inf = blasius->U_inf; - CeedScalar Rd = GasConstant(&blasius->newtonian_ctx); + CeedInt N = blasius->n_cheb; + CeedScalar mu = blasius->newtonian_ctx.mu; + State S_infty = blasius->S_infty; + CeedScalar nu = mu / rho_infty; + CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity)); + CeedScalar eta = x[1] * sqrt(U_infty / (nu * (x0 + x[0] - x_inflow))); + CeedScalar X = 2 * (eta / blasius->eta_max) - 1.; + CeedScalar Rd = GasConstant(&blasius->newtonian_ctx); CeedScalar f[4], h[4]; ChebyshevEval(N, blasius->Tf_cheb, X, blasius->eta_max, f); ChebyshevEval(N - 1, blasius->Th_cheb, X, blasius->eta_max, h); - *t12 = mu * U_inf * f[2] * sqrt(U_inf / (nu * (x0 + x[0] - x_inflow))); + *t12 = mu * U_infty * f[2] * sqrt(U_infty / (nu * (x0 + x[0] - x_inflow))); CeedScalar Y[5]; - Y[1] = U_inf * f[1]; - Y[2] = 0.5 * sqrt(nu * U_inf / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]); + Y[1] = U_infty * f[1]; + Y[2] = 0.5 * sqrt(nu * U_infty / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]); Y[3] = 0.; - Y[4] = blasius->T_inf * h[0]; + Y[4] = S_infty.Y.temperature * h[0]; Y[0] = rho_infty / h[0] * Rd * Y[4]; return StateFromY(&blasius->newtonian_ctx, Y); } @@ -109,24 +111,17 @@ CEED_QFUNCTION(ICsBlasius)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce const CeedScalar x_inflow = context->x_inflow; CeedScalar t12; - const CeedScalar Y_inf[5] = {context->P0, context->U_inf, 0, 0, context->T_inf}; - const State s_inf = StateFromY(gas, Y_inf); + const State S_infty = context->S_infty; + const CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity)); - const CeedScalar x0 = context->U_inf * s_inf.U.density / (mu * 25 / Square(delta0)); + const CeedScalar x0 = U_infty * S_infty.U.density / (mu * 25 / Square(delta0)); CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar x[3] = {X[0][i], X[1][i], X[2][i]}; - State s = 
BlasiusSolution(context, x, x0, x_inflow, s_inf.U.density, &t12); + State s = BlasiusSolution(context, x, x0, x_inflow, S_infty.U.density, &t12); CeedScalar q[5] = {0}; - switch (gas->state_var) { - case STATEVAR_CONSERVATIVE: - UnpackState_U(s.U, q); - break; - case STATEVAR_PRIMITIVE: - UnpackState_Y(s.Y, q); - break; - } + StateToQ(gas, s, q, gas->state_var); for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; } return 0; @@ -143,8 +138,10 @@ CEED_QFUNCTION(Blasius_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in const bool is_implicit = context->implicit; const NewtonianIdealGasContext gas = &context->newtonian_ctx; - const CeedScalar rho_0 = context->P0 / (GasConstant(gas) * context->T_inf); - const CeedScalar x0 = context->U_inf * rho_0 / (gas->mu * 25 / Square(context->delta0)); + State S_infty = context->S_infty; + const CeedScalar rho_0 = S_infty.U.density; + const CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity)); + const CeedScalar x0 = U_infty * rho_0 / (gas->mu * 25 / Square(context->delta0)); const CeedScalar zeros[11] = {0.}; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { @@ -198,8 +195,10 @@ CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar * const bool is_implicit = context->implicit; const CeedScalar Rd = GasConstant(gas); const CeedScalar gamma = HeatCapacityRatio(gas); - const CeedScalar rho_0 = context->P0 / (Rd * context->T_inf); - const CeedScalar x0 = context->U_inf * rho_0 / (gas->mu * 25 / (Square(context->delta0))); + const State S_infty = context->S_infty; + const CeedScalar rho_0 = S_infty.U.density; + const CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity)); + const CeedScalar x0 = U_infty * rho_0 / (gas->mu * 25 / Square(context->delta0)); CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { CeedScalar wdetJb, norm[3]; @@ -216,11 +215,12 @@ CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar * if (context->weakT) { // rho should be 
from the current solution drho = dq[0][i]; - CeedScalar dE_internal = drho * gas->cv * context->T_inf; + CeedScalar dE_internal = drho * gas->cv * S_infty.Y.temperature; CeedScalar dE_kinetic = .5 * drho * Dot3(s.Y.velocity, s.Y.velocity); dE = dE_internal + dE_kinetic; - dP = drho * Rd * context->T_inf; // interior rho with exterior T - } else { // rho specified, E_internal from solution + dP = drho * Rd * S_infty.Y.temperature; // interior rho with exterior T + } else { + // rho specified, E_internal from solution drho = 0; dE = dq[4][i]; dP = dE * (gamma - 1.); diff --git a/examples/fluids/qfunctions/channel.h b/examples/fluids/qfunctions/channel.h index 7634696c74..9595c81701 100644 --- a/examples/fluids/qfunctions/channel.h +++ b/examples/fluids/qfunctions/channel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,11 @@ /// @file /// Operator for Navier-Stokes example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#include +#endif #include "newtonian_state.h" #include "newtonian_types.h" @@ -64,21 +67,14 @@ CEED_QFUNCTION(ICsChannel)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const ChannelContext context = (ChannelContext)ctx; + const ChannelContext context = (ChannelContext)ctx; + const NewtonianIdealGasContext gas = &context->newtonian_ctx; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar x[] = {X[0][i], X[1][i], X[2][i]}; State s = Exact_Channel(3, 0., x, 5, ctx); CeedScalar q[5] = {0}; - switch (context->newtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - UnpackState_U(s.U, q); - break; - case STATEVAR_PRIMITIVE: - UnpackState_Y(s.Y, q); - break; - } - + StateToQ(gas, s, q, gas->state_var); for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; } return 0; diff --git a/examples/fluids/qfunctions/densitycurrent.h b/examples/fluids/qfunctions/densitycurrent.h index 4d10261b4f..69fe9488fd 100644 --- a/examples/fluids/qfunctions/densitycurrent.h +++ b/examples/fluids/qfunctions/densitycurrent.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -11,8 +11,10 @@ // Model from: // Semi-Implicit Formulations of the Navier-Stokes Equations: Application to // Nonhydrostatic Atmospheric Modeling, Giraldo, Restelli, and Lauter (2010). 
-#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #include "newtonian_state.h" #include "newtonian_types.h" @@ -133,21 +135,14 @@ CEED_QFUNCTION(ICsDC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const DensityCurrentContext context = (DensityCurrentContext)ctx; + const DensityCurrentContext context = (DensityCurrentContext)ctx; + const NewtonianIdealGasContext gas = &context->newtonian_ctx; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar x[] = {X[0][i], X[1][i], X[2][i]}; State s = Exact_DC(3, 0., x, 5, ctx); CeedScalar q[5] = {0}; - switch (context->newtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - UnpackState_U(s.U, q); - break; - case STATEVAR_PRIMITIVE: - UnpackState_Y(s.Y, q); - break; - } - + StateToQ(gas, s, q, gas->state_var); for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; } return 0; diff --git a/examples/fluids/qfunctions/differential_filter.h b/examples/fluids/qfunctions/differential_filter.h index 703bc2bfc8..a983cd7a63 100644 --- a/examples/fluids/qfunctions/differential_filter.h +++ b/examples/fluids/qfunctions/differential_filter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,10 @@ // /// @file /// Implementation of differential filtering -#include +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif #include "differential_filter_enums.h" #include "newtonian_state.h" @@ -64,6 +67,10 @@ CEED_QFUNCTION(DifferentialFilter_RHS_Prim)(void *ctx, CeedInt Q, const CeedScal return DifferentialFilter_RHS(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(DifferentialFilter_RHS_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return DifferentialFilter_RHS(ctx, Q, in, out, STATEVAR_ENTROPY); +} + CEED_QFUNCTION_HELPER CeedScalar VanDriestWallDamping(const CeedScalar wall_dist_plus, const CeedScalar A_plus) { return -expm1(-wall_dist_plus / A_plus); } diff --git a/examples/fluids/qfunctions/differential_filter_enums.h b/examples/fluids/qfunctions/differential_filter_enums.h index ffa548fff6..9c000c3b9d 100644 --- a/examples/fluids/qfunctions/differential_filter_enums.h +++ b/examples/fluids/qfunctions/differential_filter_enums.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/qfunctions/eulervortex.h b/examples/fluids/qfunctions/eulervortex.h index 308cb50cea..2f6c0ad003 100644 --- a/examples/fluids/qfunctions/eulervortex.h +++ b/examples/fluids/qfunctions/eulervortex.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -11,8 +11,11 @@ // Model from: // On the Order of Accuracy and Numerical Performance of Two Classes of Finite Volume WENO Schemes, Zhang, Zhang, and Shu (2011). -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#include +#endif #include "utils.h" diff --git a/examples/fluids/qfunctions/gaussianwave.h b/examples/fluids/qfunctions/gaussianwave.h index 4115d86a81..0bf6b612b4 100644 --- a/examples/fluids/qfunctions/gaussianwave.h +++ b/examples/fluids/qfunctions/gaussianwave.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,10 @@ /// @file /// Thermodynamic wave propogation for testing freestream/non-reflecting boundary conditions. Proposed in Mengaldo et. al. 2014 -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #include "newtonian_state.h" #include "utils.h" @@ -69,3 +71,7 @@ CEED_QFUNCTION(IC_GaussianWave_Conserv)(void *ctx, CeedInt Q, const CeedScalar * CEED_QFUNCTION(IC_GaussianWave_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return IC_GaussianWave(ctx, Q, in, out, STATEVAR_PRIMITIVE); } + +CEED_QFUNCTION(IC_GaussianWave_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return IC_GaussianWave(ctx, Q, in, out, STATEVAR_ENTROPY); +} diff --git a/examples/fluids/qfunctions/grid_anisotropy_tensor.h b/examples/fluids/qfunctions/grid_anisotropy_tensor.h index ef59a54c6d..80078afcd4 100644 --- a/examples/fluids/qfunctions/grid_anisotropy_tensor.h +++ b/examples/fluids/qfunctions/grid_anisotropy_tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,7 @@ /// @file /// Element anisotropy tensor, as defined in 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' /// Prakash et al. 2022 -#include +#include #include "utils.h" #include "utils_eigensolver_jacobi.h" diff --git a/examples/fluids/qfunctions/inverse_multiplicity.h b/examples/fluids/qfunctions/inverse_multiplicity.h index c51fc0586b..2c4a5ef335 100644 --- a/examples/fluids/qfunctions/inverse_multiplicity.h +++ b/examples/fluids/qfunctions/inverse_multiplicity.h @@ -1,10 +1,10 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include // @brief Calculate the inverse of the multiplicity, reducing to a single component CEED_QFUNCTION(InverseMultiplicity)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { diff --git a/examples/fluids/qfunctions/mass.h b/examples/fluids/qfunctions/mass.h index 1147a2bb31..81de13d16e 100644 --- a/examples/fluids/qfunctions/mass.h +++ b/examples/fluids/qfunctions/mass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,10 @@ /// @file /// Mass operator for Navier-Stokes example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ***************************************************************************** // This QFunction applies the mass matrix to five interlaced fields. diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h index 1068519217..fa470e5cd1 100644 --- a/examples/fluids/qfunctions/newtonian.h +++ b/examples/fluids/qfunctions/newtonian.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,9 +7,11 @@ /// @file /// Operator for Navier-Stokes example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include #include +#endif #include "newtonian_state.h" #include "newtonian_types.h" @@ -43,11 +45,16 @@ CEED_QFUNCTION_HELPER int ICsNewtonianIG(void *ctx, CeedInt Q, const CeedScalar return 0; } +CEED_QFUNCTION(ICsNewtonianIG_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_CONSERVATIVE); +} + CEED_QFUNCTION(ICsNewtonianIG_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_PRIMITIVE); } -CEED_QFUNCTION(ICsNewtonianIG_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_CONSERVATIVE); + +CEED_QFUNCTION(ICsNewtonianIG_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_ENTROPY); } CEED_QFUNCTION_HELPER void 
MassFunction_Newtonian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, @@ -143,15 +150,18 @@ CEED_QFUNCTION(RHSFunction_Newtonian)(void *ctx, CeedInt Q, const CeedScalar *co const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; const CeedScalar(*Grad_q) = in[1]; const CeedScalar(*q_data) = in[2]; + const CeedScalar(*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; CeedScalar(*Grad_v)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1]; NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx; const CeedScalar *g = context->g; const CeedScalar dt = context->dt; + const CeedScalar P0 = context->idl_pressure; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar U[5], wdetJ, dXdx[3][3]; + CeedScalar U[5], wdetJ, dXdx[3][3]; + const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]}; for (int j = 0; j < 5; j++) U[j] = q[j][i]; QdataUnpack_3D(Q, i, q_data, &wdetJ, dXdx); State s = StateFromU(context, U); @@ -179,6 +189,13 @@ CEED_QFUNCTION(RHSFunction_Newtonian)(void *ctx, CeedInt Q, const CeedScalar *co const CeedScalar body_force[5] = {0, s.U.density * g[0], s.U.density * g[1], s.U.density * g[2], Dot3(s.U.momentum, g)}; for (int j = 0; j < 5; j++) v[j][i] = wdetJ * body_force[j]; + if (context->idl_enable) { + const CeedScalar sigma = LinearRampCoefficient(context->idl_amplitude, context->idl_length, context->idl_start, x_i[0]); + CeedScalar damp_state[5] = {s.Y.pressure - P0, 0, 0, 0, 0}, idl_residual[5] = {0.}; + InternalDampingLayer(context, s, sigma, damp_state, idl_residual); + for (int j = 0; j < 5; j++) v[j][i] -= wdetJ * idl_residual[j]; + } + // -- Stabilization method: none (Galerkin), SU, or SUPG CeedScalar Tau_d[3], stab[5][3], U_dot[5] = {0}; Tau_diagPrim(context, s, dXdx, dt, Tau_d); @@ -211,7 +228,7 @@ CEED_QFUNCTION_HELPER int IFunction_Newtonian(void *ctx, CeedInt Q, const CeedSc NewtonianIdealGasContext context = 
(NewtonianIdealGasContext)ctx; const CeedScalar *g = context->g; const CeedScalar dt = context->dt; - const CeedScalar P0 = context->P0; + const CeedScalar P0 = context->idl_pressure; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; @@ -282,6 +299,10 @@ CEED_QFUNCTION(IFunction_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar return IFunction_Newtonian(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(IFunction_Newtonian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return IFunction_Newtonian(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // ***************************************************************************** // This QFunction implements the jacobian of the Navier-Stokes equations for implicit time stepping method. // ***************************************************************************** @@ -364,6 +385,10 @@ CEED_QFUNCTION(IJacobian_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar return IJacobian_Newtonian(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(IJacobian_Newtonian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return IJacobian_Newtonian(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // ***************************************************************************** // Compute boundary integral (ie. 
for strongly set inflows) // ***************************************************************************** @@ -418,6 +443,10 @@ CEED_QFUNCTION(BoundaryIntegral_Prim)(void *ctx, CeedInt Q, const CeedScalar *co return BoundaryIntegral(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(BoundaryIntegral_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return BoundaryIntegral(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // ***************************************************************************** // Jacobian for "set nothing" boundary integral // ***************************************************************************** @@ -473,3 +502,7 @@ CEED_QFUNCTION(BoundaryIntegral_Jacobian_Conserv)(void *ctx, CeedInt Q, const Ce CEED_QFUNCTION(BoundaryIntegral_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return BoundaryIntegral_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE); } + +CEED_QFUNCTION(BoundaryIntegral_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return BoundaryIntegral_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY); +} diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h index 185caf06d6..0b6796f2fc 100644 --- a/examples/fluids/qfunctions/newtonian_state.h +++ b/examples/fluids/qfunctions/newtonian_state.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -9,8 +9,10 @@ /// Structs and helper functions regarding the state of a newtonian simulation #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #include "newtonian_types.h" #include "utils.h" @@ -38,6 +40,12 @@ CEED_QFUNCTION_HELPER void UnpackState_Y(StatePrimitive s, CeedScalar Y[5]) { Y[4] = s.temperature; } +CEED_QFUNCTION_HELPER void UnpackState_V(StateEntropy s, CeedScalar V[5]) { + V[0] = s.S_density; + for (int i = 0; i < 3; i++) V[i + 1] = s.S_momentum[i]; + V[4] = s.S_energy; +} + CEED_QFUNCTION_HELPER CeedScalar HeatCapacityRatio(NewtonianIdealGasContext gas) { return gas->cp / gas->cv; } CEED_QFUNCTION_HELPER CeedScalar GasConstant(NewtonianIdealGasContext gas) { return gas->cp - gas->cv; } @@ -49,14 +57,12 @@ CEED_QFUNCTION_HELPER CeedScalar SoundSpeed(NewtonianIdealGasContext gas, CeedSc CEED_QFUNCTION_HELPER CeedScalar Mach(NewtonianIdealGasContext gas, CeedScalar T, CeedScalar u) { return u / SoundSpeed(gas, T); } CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy(NewtonianIdealGasContext gas, const State s) { - // Ignoring potential energy - CeedScalar e_internal = gas->cv * s.Y.temperature; CeedScalar e_kinetic = 0.5 * Dot3(s.Y.velocity, s.Y.velocity); + CeedScalar e_internal = gas->cv * s.Y.temperature; return e_internal + e_kinetic + s.Y.pressure / s.U.density; } CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, const State s, const State ds) { - // Ignoring potential energy CeedScalar de_kinetic = Dot3(ds.Y.velocity, s.Y.velocity); CeedScalar de_internal = gas->cv * ds.Y.temperature; return de_internal + de_kinetic + ds.Y.pressure / s.U.density - s.Y.pressure / Square(s.U.density) * ds.U.density; @@ -89,6 +95,63 @@ CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative_fwd(Newtonia return dY; } +CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) { + 
StateEntropy V; + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar rho = Y.pressure / (GasConstant(gas) * Y.temperature); + const CeedScalar entropy = log(Y.pressure) - gamma * log(rho); + const CeedScalar rho_div_p = rho / Y.pressure; + const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity); + + V.S_density = (gamma - entropy) / (gamma - 1) - rho_div_p * e_kinetic; + for (int i = 0; i < 3; i++) V.S_momentum[i] = rho_div_p * Y.velocity[i]; + V.S_energy = -rho_div_p; + return V; +} + +CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) { + StateEntropy dV; + const CeedScalar gamma = HeatCapacityRatio(gas); + CeedScalar drho = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / (GasConstant(gas) * s.Y.temperature * s.Y.temperature); + + const CeedScalar e_kinetic = .5 * Dot3(s.Y.velocity, s.Y.velocity); + const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity); + const CeedScalar rho_div_p = s.U.density / s.Y.pressure; + const CeedScalar drho_div_p = (drho * s.Y.pressure - s.U.density * dY.pressure) / Square(s.Y.pressure); + + CeedScalar dentropy = dY.pressure / s.Y.pressure - gamma * drho / s.U.density; + + dV.S_density = -dentropy / (gamma - 1) - de_kinetic * rho_div_p - e_kinetic * drho_div_p; + for (CeedInt i = 0; i < 3; i++) dV.S_momentum[i] = rho_div_p * dY.velocity[i] + drho_div_p * s.Y.velocity[i]; + dV.S_energy = -drho_div_p; + return dV; +} + +CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) { + StatePrimitive Y; + for (int i = 0; i < 3; i++) Y.velocity[i] = -V.S_momentum[i] / V.S_energy; + Y.temperature = -1 / (GasConstant(gas) * V.S_energy); + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity); + const CeedScalar entropy = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy); + const CeedScalar log_P = 
-(entropy + gamma * log(-V.S_energy)) / (gamma - 1); + Y.pressure = exp(log_P); + return Y; +} + +CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) { + StatePrimitive dY; + StateEntropy V = StateEntropyFromPrimitive(gas, s.Y); + for (int i = 0; i < 3; i++) dY.velocity[i] = -(dV.S_momentum[i] - V.S_momentum[i] * dV.S_energy / V.S_energy) / V.S_energy; + dY.temperature = dV.S_energy / (GasConstant(gas) * V.S_energy * V.S_energy); + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar e_kinetic = 0.5 * Dot3(s.Y.velocity, s.Y.velocity); + const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity); + const CeedScalar dentropy = (1 - gamma) * (dV.S_density - e_kinetic * dV.S_energy - de_kinetic * V.S_energy); + dY.pressure = s.Y.pressure * (-dentropy - gamma * dV.S_energy / V.S_energy) / (gamma - 1); + return dY; +} + CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) { StateConservative U; U.density = Y.pressure / (GasConstant(gas) * Y.temperature); @@ -116,6 +179,77 @@ CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive_fwd(Newto return dU; } +CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromConservative(NewtonianIdealGasContext gas, StateConservative U) { + StateEntropy V; + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar e_kinetic = .5 * Dot3(U.momentum, U.momentum) / U.density; + const CeedScalar e_internal = U.E_total - e_kinetic; + const CeedScalar p = (gamma - 1) * e_internal; + const CeedScalar entropy = log(p) - gamma * log(U.density); + + V.S_density = (gamma - entropy) / (gamma - 1) - e_kinetic / p; + for (int i = 0; i < 3; i++) V.S_momentum[i] = U.momentum[i] / p; + V.S_energy = -U.density / p; + return V; +} + +CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromConservative_fwd(NewtonianIdealGasContext gas, State s, StateConservative dU) { + StateEntropy dV; + 
const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar e_kinetic = .5 * Dot3(s.U.momentum, s.U.momentum) / s.U.density; + const CeedScalar de_kinetic = (Dot3(s.U.momentum, dU.momentum) - e_kinetic * dU.density) / s.U.density; + const CeedScalar de_internal = dU.E_total - de_kinetic; + const CeedScalar p = s.Y.pressure; + const CeedScalar dp = (gamma - 1) * de_internal; + + CeedScalar dentropy = dp / p - gamma * dU.density / s.U.density; + + dV.S_density = -dentropy / (gamma - 1) - de_kinetic / p + dp * e_kinetic / Square(p); + for (CeedInt i = 0; i < 3; i++) { + dV.S_momentum[i] = (dU.momentum[i] - s.U.momentum[i] * dp / p) / p; + } + dV.S_energy = -(dU.density - s.U.density * dp / p) / p; + return dV; +} + +CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) { + StateConservative U; + CeedScalar velocity[3]; + for (int i = 0; i < 3; i++) velocity[i] = -V.S_momentum[i] / V.S_energy; + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar e_kinetic = 0.5 * Dot3(velocity, velocity); + const CeedScalar entropy = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy); + const CeedScalar log_rho = -(entropy + log(-V.S_energy)) / (gamma - 1); + U.density = exp(log_rho); + for (int i = 0; i < 3; i++) U.momentum[i] = U.density * velocity[i]; + + const CeedScalar e_internal = -gas->cv / (GasConstant(gas) * V.S_energy); + U.E_total = U.density * (e_internal + e_kinetic); + return U; +} + +CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) { + StateConservative dU; + CeedScalar dvelocity[3]; + StateEntropy V = StateEntropyFromPrimitive(gas, s.Y); + for (int i = 0; i < 3; i++) dvelocity[i] = (-dV.S_momentum[i] - s.Y.velocity[i] * dV.S_energy) / V.S_energy; + const CeedScalar gamma = HeatCapacityRatio(gas); + const CeedScalar e_kinetic = 0.5 * Dot3(s.Y.velocity, s.Y.velocity); + const CeedScalar de_kinetic 
= Dot3(dvelocity, s.Y.velocity); + const CeedScalar entropy = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy); + const CeedScalar dentropy = -(gamma - 1) * (dV.S_density - (de_kinetic * V.S_energy + e_kinetic * dV.S_energy)); + const CeedScalar log_rho = -(entropy + log(-V.S_energy)) / (gamma - 1); + const CeedScalar rho = exp(log_rho); + dU.density = -rho / (gamma - 1) * (dentropy + dV.S_energy / V.S_energy); + for (int i = 0; i < 3; i++) dU.momentum[i] = dU.density * s.Y.velocity[i] + s.U.density * dvelocity[i]; + + const CeedScalar e_internal = -gas->cv / (GasConstant(gas) * V.S_energy); + const CeedScalar de_internal = gas->cv * dV.S_energy / (GasConstant(gas) * V.S_energy * V.S_energy); + const CeedScalar e_total = e_internal + e_kinetic; + dU.E_total = dU.density * e_total + s.U.density * (de_internal + de_kinetic); + return dU; +} + CEED_QFUNCTION_HELPER State StateFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) { StateConservative U = StateConservativeFromPrimitive(gas, Y); State s; @@ -156,6 +290,11 @@ CEED_QFUNCTION_HELPER void StateToU(NewtonianIdealGasContext gas, const State in CEED_QFUNCTION_HELPER void StateToY(NewtonianIdealGasContext gas, const State input, CeedScalar Y[5]) { UnpackState_Y(input.Y, Y); } +CEED_QFUNCTION_HELPER void StateToV(NewtonianIdealGasContext gas, const State input, CeedScalar V[5]) { + StateEntropy state_V = StateEntropyFromPrimitive(gas, input.Y); + UnpackState_V(state_V, V); +} + CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State input, CeedScalar Q[5], StateVariable state_var) { switch (state_var) { case STATEVAR_CONSERVATIVE: @@ -164,6 +303,25 @@ CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State in case STATEVAR_PRIMITIVE: StateToY(gas, input, Q); break; + case STATEVAR_ENTROPY: + StateToV(gas, input, Q); + break; + } +} + +CEED_QFUNCTION_HELPER void StateToQ_fwd(NewtonianIdealGasContext gas, const State input, const State dinput, 
CeedScalar dQ[5], + StateVariable state_var) { + switch (state_var) { + case STATEVAR_CONSERVATIVE: + case STATEVAR_PRIMITIVE: + StateToQ(gas, dinput, dQ, state_var); + break; + case STATEVAR_ENTROPY: { + StateEntropy dstate_v; + + dstate_v = StateEntropyFromPrimitive_fwd(gas, input, dinput.Y); + UnpackState_V(dstate_v, dQ); + } break; } } @@ -211,6 +369,32 @@ CEED_QFUNCTION_HELPER State StateFromY_fwd(NewtonianIdealGasContext gas, State s return ds; } +CEED_QFUNCTION_HELPER State StateFromV(NewtonianIdealGasContext gas, const CeedScalar V[5]) { + State s; + StateEntropy state_V; + state_V.S_density = V[0]; + state_V.S_momentum[0] = V[1]; + state_V.S_momentum[1] = V[2]; + state_V.S_momentum[2] = V[3]; + state_V.S_energy = V[4]; + s.U = StateConservativeFromEntropy(gas, state_V); + s.Y = StatePrimitiveFromEntropy(gas, state_V); + return s; +} + +CEED_QFUNCTION_HELPER State StateFromV_fwd(NewtonianIdealGasContext gas, State s, const CeedScalar dV[5]) { + State ds; + StateEntropy state_dV; + state_dV.S_density = dV[0]; + state_dV.S_momentum[0] = dV[1]; + state_dV.S_momentum[1] = dV[2]; + state_dV.S_momentum[2] = dV[3]; + state_dV.S_energy = dV[4]; + ds.U = StateConservativeFromEntropy_fwd(gas, s, state_dV); + ds.Y = StatePrimitiveFromEntropy_fwd(gas, s, state_dV); + return ds; +} + CEED_QFUNCTION_HELPER State StateFromQ(NewtonianIdealGasContext gas, const CeedScalar Q[5], StateVariable state_var) { State s; switch (state_var) { @@ -220,6 +404,9 @@ CEED_QFUNCTION_HELPER State StateFromQ(NewtonianIdealGasContext gas, const CeedS case STATEVAR_PRIMITIVE: s = StateFromY(gas, Q); break; + case STATEVAR_ENTROPY: + s = StateFromV(gas, Q); + break; } return s; } @@ -233,6 +420,9 @@ CEED_QFUNCTION_HELPER State StateFromQ_fwd(NewtonianIdealGasContext gas, State s case STATEVAR_PRIMITIVE: ds = StateFromY_fwd(gas, s, dQ); break; + case STATEVAR_ENTROPY: + ds = StateFromV_fwd(gas, s, dQ); + break; } return ds; } diff --git a/examples/fluids/qfunctions/newtonian_types.h 
b/examples/fluids/qfunctions/newtonian_types.h index 3a5402c36d..70b2b4c3bd 100644 --- a/examples/fluids/qfunctions/newtonian_types.h +++ b/examples/fluids/qfunctions/newtonian_types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -6,13 +6,17 @@ // This file is part of CEED: http://github.com/ceed #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif #include "stabilization_types.h" typedef enum { STATEVAR_CONSERVATIVE = 0, STATEVAR_PRIMITIVE = 1, + STATEVAR_ENTROPY = 2, } StateVariable; typedef struct NewtonianIdealGasContext_ *NewtonianIdealGasContext; @@ -32,11 +36,11 @@ struct NewtonianIdealGasContext_ { CeedScalar dt; CeedScalar time; CeedScalar ijacobian_time_shift; - CeedScalar P0; bool is_implicit; StateVariable state_var; StabilizationType stabilization; bool idl_enable; + CeedScalar idl_pressure; CeedScalar idl_amplitude; CeedScalar idl_start; CeedScalar idl_length; @@ -48,6 +52,12 @@ typedef struct { CeedScalar temperature; } StatePrimitive; +typedef struct { + CeedScalar S_density; + CeedScalar S_momentum[3]; + CeedScalar S_energy; +} StateEntropy; + typedef struct SetupContext_ *SetupContext; struct SetupContext_ { StatePrimitive reference; diff --git a/examples/fluids/qfunctions/riemann_solver.h b/examples/fluids/qfunctions/riemann_solver.h index 8ab0570504..7d884e9ad1 100644 --- a/examples/fluids/qfunctions/riemann_solver.h +++ b/examples/fluids/qfunctions/riemann_solver.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -84,13 +84,11 @@ CEED_QFUNCTION_HELPER StateConservative Flux_HLL_fwd(State left, State right, St UnpackState_U(dflux_left, dF_l); UnpackState_U(dflux_right, dF_r); for (int i = 0; i < 5; i++) { - const CeedScalar U_diff = U_r[i] - U_l[i]; - const CeedScalar S_diff = S_r - S_l; - const CeedScalar F_hll_denom = S_r * F_l[i] - S_l * F_r[i] + S_l * S_r * U_diff; + const CeedScalar S_diff = S_r - S_l; - dF_hll[i] += ((F_l[i] + S_r * U_diff) * S_diff - F_hll_denom) / Square(S_diff) * dS_r; - dF_hll[i] += ((-F_r[i] + S_r * U_diff) * S_diff + F_hll_denom) / Square(S_diff) * dS_l; - dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * dU_r[i] - S_r * S_l * dU_l[i]) / S_diff; + dF_hll[i] += (S_l * (-F_l[i] + F_r[i] + S_l * U_l[i] - S_l * U_r[i]) / Square(S_diff)) * dS_r; + dF_hll[i] += (S_r * (F_l[i] - F_r[i] - S_r * U_l[i] + S_r * U_r[i]) / Square(S_diff)) * dS_l; + dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * (dU_r[i] - dU_l[i])) / S_diff; } StateConservative dF = { dF_hll[0], @@ -110,7 +108,6 @@ CEED_QFUNCTION_HELPER void ComputeHLLSpeeds_Roe(NewtonianIdealGasContext gas, St // Stability requires that these speed estimates are *at least* as fast as the physical wave speeds. 
CeedScalar u_roe = RoeAverage(r, u_left, u_right); - // TODO: revisit this for gravity CeedScalar H_left = TotalSpecificEnthalpy(gas, left); CeedScalar H_right = TotalSpecificEnthalpy(gas, right); CeedScalar H_roe = RoeAverage(r, H_left, H_right); @@ -142,7 +139,8 @@ CEED_QFUNCTION_HELPER void ComputeHLLSpeeds_Roe_fwd(NewtonianIdealGasContext gas CeedScalar H_roe = RoeAverage(r, H_left, H_right); CeedScalar dH_roe = RoeAverage_fwd(r, dr, H_left, H_right, dH_left, dH_right); CeedScalar a_roe = sqrt((gamma - 1) * (H_roe - 0.5 * Square(u_roe))); - CeedScalar da_roe = 0.5 * (gamma - 1) / sqrt(H_roe) * dH_roe - 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe; + CeedScalar da_roe = 0.5 * sqrt((gamma - 1) / (H_roe - 0.5 * Square(u_roe))) * dH_roe; // (da/dH) dH + da_roe -= 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe; // (da/du) du *s_left = u_roe - a_roe; *ds_left = du_roe - da_roe; diff --git a/examples/fluids/qfunctions/setupgeo.h b/examples/fluids/qfunctions/setupgeo.h index a4d5181ad7..62b8390376 100644 --- a/examples/fluids/qfunctions/setupgeo.h +++ b/examples/fluids/qfunctions/setupgeo.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,10 @@ /// @file /// Geometric factors (3D) for Navier-Stokes example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #include "setupgeo_helpers.h" #include "utils.h" diff --git a/examples/fluids/qfunctions/setupgeo2d.h b/examples/fluids/qfunctions/setupgeo2d.h index c01753b2c1..0cd5649296 100644 --- a/examples/fluids/qfunctions/setupgeo2d.h +++ b/examples/fluids/qfunctions/setupgeo2d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,8 @@ /// @file /// Geometric factors (2D) for Navier-Stokes example using PETSc -#include -#include +#include + #include "setupgeo_helpers.h" #include "utils.h" @@ -98,3 +98,57 @@ CEED_QFUNCTION(SetupBoundary2d)(void *ctx, CeedInt Q, const CeedScalar *const *i } return 0; } + +// ***************************************************************************** +// This QFunction sets up the geometric factor required for integration when reference coordinates are in 2D and the physical coordinates are in 3D +// +// Reference (parent) 2D coordinates: X +// Physical (current) 3D coordinates: x +// Change of coordinate matrix: +// dxdX_{i,j} = dx_i/dX_j (indicial notation) [3 * 2] +// Inverse change of coordinate matrix: +// dXdx_{i,j} = dX_i/dx_j (indicial notation) [2 * 3] +// +// (J1,J2,J3) is given by the cross product of the columns of dxdX_{i,j} +// +// detJb is the magnitude of (J1,J2,J3) +// +// dXdx is calculated via Moore–Penrose inverse: +// +// dX_i/dx_j = (dxdX^T dxdX)^(-1) dxdX +// = (dx_l/dX_i * dx_l/dX_k)^(-1) dx_j/dX_k +// +// All quadrature data is stored in 10 field vector of quadrature data. 
+// +// We require the determinant of the Jacobian to properly compute integrals of +// the form: int( u v ) +// +// Stored: w detJb +// in q_data_sur[0] +// +// Normal vector = (J1,J2,J3) / detJb +// +// Stored: (J1,J2,J3) / detJb +// +// Stored: dXdx_{i,j} +// in q_data_sur[1:6] as +// [dXdx_11 dXdx_12 dXdx_13] +// [dXdx_21 dXdx_22 dXdx_23] +// ***************************************************************************** +CEED_QFUNCTION(Setup2D_3Dcoords)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + const CeedScalar(*w) = in[1]; + CeedScalar(*q_data_sur) = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + CeedScalar detJb, normal[3], dXdx[2][3]; + + NormalVectorFromdxdX_3D(Q, i, J, normal, &detJb); + InvertBoundaryMappingJacobian_3D(Q, i, J, dXdx); + const CeedScalar wdetJ = w[i] * detJb; + + StoredValuesPack(Q, i, 0, 1, &wdetJ, q_data_sur); + StoredValuesPack(Q, i, 1, 6, (const CeedScalar *)dXdx, q_data_sur); + } + return 0; +} diff --git a/examples/fluids/qfunctions/setupgeo_helpers.h b/examples/fluids/qfunctions/setupgeo_helpers.h index 930ff7bb72..6677225f4f 100644 --- a/examples/fluids/qfunctions/setupgeo_helpers.h +++ b/examples/fluids/qfunctions/setupgeo_helpers.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -9,8 +9,10 @@ /// Geometric factors (3D) for Navier-Stokes example using PETSc #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #include "utils.h" diff --git a/examples/fluids/qfunctions/sgs_dd_model.h b/examples/fluids/qfunctions/sgs_dd_model.h deleted file mode 100644 index da1a8f7967..0000000000 --- a/examples/fluids/qfunctions/sgs_dd_model.h +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Structs and helper functions to evaluate data-driven subgrid-stress modeling -/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy -/// correction models for data-informed Reynolds stress closure' 2022 -#include - -#include "newtonian_state.h" -#include "newtonian_types.h" -#include "sgs_dd_utils.h" -#include "utils.h" -#include "utils_eigensolver_jacobi.h" - -typedef struct SgsDDContext_ *SgsDDContext; -struct SgsDDContext_ { - CeedInt num_inputs, num_outputs; - CeedInt num_layers; - CeedInt num_neurons; - CeedScalar alpha; - - struct NewtonianIdealGasContext_ gas; - struct { - size_t bias1, bias2; - size_t weight1, weight2; - size_t out_scaling; - } offsets; - size_t total_bytes; - CeedScalar data[1]; -}; - -CEED_QFUNCTION_HELPER void LeakyReLU(CeedScalar *x, const CeedScalar alpha, const CeedInt N) { - for (CeedInt i = 0; i < N; i++) x[i] *= (x[i] < 0 ? 
alpha : 1.); -} - -CEED_QFUNCTION_HELPER void DataDrivenInference(const CeedScalar *inputs, CeedScalar *outputs, SgsDDContext sgsdd_ctx) { - const CeedInt num_neurons = sgsdd_ctx->num_neurons; - const CeedInt num_inputs = sgsdd_ctx->num_inputs; - const CeedInt num_outputs = sgsdd_ctx->num_outputs; - const CeedScalar alpha = sgsdd_ctx->alpha; - const CeedScalar *bias1 = &sgsdd_ctx->data[sgsdd_ctx->offsets.bias1]; - const CeedScalar *bias2 = &sgsdd_ctx->data[sgsdd_ctx->offsets.bias2]; - const CeedScalar *weight1 = &sgsdd_ctx->data[sgsdd_ctx->offsets.weight1]; - const CeedScalar *weight2 = &sgsdd_ctx->data[sgsdd_ctx->offsets.weight2]; - CeedScalar V[20] = {0.}; - - CopyN(bias1, V, num_neurons); - MatVecNM(weight1, inputs, num_neurons, num_inputs, CEED_NOTRANSPOSE, V); - LeakyReLU(V, alpha, num_neurons); - CopyN(bias2, outputs, num_outputs); - MatVecNM(weight2, V, num_outputs, num_neurons, CEED_NOTRANSPOSE, outputs); -} - -CEED_QFUNCTION_HELPER void ComputeSgsDD_Fused(const CeedScalar grad_velo_aniso[3][3], const CeedScalar km_A_ij[6], const CeedScalar delta, - const CeedScalar viscosity, CeedScalar kmsgs_stress[6], SgsDDContext sgsdd_ctx) { - CeedScalar inputs[6], grad_velo_magnitude, eigenvectors[3][3], sgs_sframe_sym[6] = {0.}, new_bounds[6][2]; - // Copying new_bounds because Sycl online compiler doesn't like direct casting the pointer - CopyN(&sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling], (CeedScalar *)new_bounds, 12); - - ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, viscosity, eigenvectors, inputs, &grad_velo_magnitude); - DataDrivenInference(inputs, sgs_sframe_sym, sgsdd_ctx); - ComputeSgsDDOutputs(sgs_sframe_sym, delta, eigenvectors, new_bounds, grad_velo_magnitude, kmsgs_stress); -} - -// @brief Calculate subgrid stress at nodes using anisotropic data-driven model -CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Fused(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, - StateVariable state_var) { - const CeedScalar(*q)[CEED_Q_VLA] 
= (const CeedScalar(*)[CEED_Q_VLA])in[0]; - const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2]; - const CeedScalar(*A_ij_delta)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - const CeedScalar(*inv_multiplicity) = (const CeedScalar(*))in[4]; - CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx; - const NewtonianIdealGasContext gas = &sgsdd_ctx->gas; - - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; - const CeedScalar grad_velo_aniso[3][3] = { - {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]}, - {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]}, - {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]} - }; - const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]}; - const CeedScalar delta = A_ij_delta[6][i]; - const State s = StateFromQ(gas, qi, state_var); - CeedScalar km_sgs[6]; - - ComputeSgsDD_Fused(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, km_sgs, sgsdd_ctx); - - for (int j = 0; j < 6; j++) v[j][i] = inv_multiplicity[i] * km_sgs[j]; - } - return 0; -} - -CEED_QFUNCTION(ComputeSgsDDNodal_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_PRIMITIVE); -} - -CEED_QFUNCTION(ComputeSgsDDNodal_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_CONSERVATIVE); -} - -// @brief Calculate inputs to anisotropic data-driven model -CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Sequential_Inputs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, - StateVariable state_var) { - const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; - const 
CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1]; - const CeedScalar(*A_ij_delta)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; - const CeedScalar(*inv_multiplicity) = (const CeedScalar(*))in[3]; - CeedScalar(*eigenvectors_stored) = out[0]; - CeedScalar(*model_inputs)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; - - const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx; - const NewtonianIdealGasContext gas = &sgsdd_ctx->gas; - - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; - const CeedScalar grad_velo_aniso[3][3] = { - {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]}, - {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]}, - {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]} - }; - const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]}; - const CeedScalar delta = A_ij_delta[6][i]; - const State s = StateFromQ(gas, qi, state_var); - - CeedScalar model_inputs_i[6], grad_velo_magnitude, eigenvectors[3][3]; - ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, eigenvectors, model_inputs_i, &grad_velo_magnitude); - - ScaleN(model_inputs_i, inv_multiplicity[i], 6); - StoredValuesPack(Q, i, 0, 6, model_inputs_i, (CeedScalar *)model_inputs); - StoredValuesPack(Q, i, 0, 9, (const CeedScalar *)eigenvectors, eigenvectors_stored); - StoredValuesPack(Q, i, 9, 1, &grad_velo_magnitude, eigenvectors_stored); - } - return CEED_ERROR_SUCCESS; -} - -CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_PRIMITIVE); -} - -CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return 
ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_CONSERVATIVE); -} - -// @brief Runs inference on the data-driven model, used predominantsly for testing and validation -CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inference)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar(*model_inputs) = in[0]; - const CeedScalar(*inv_multiplicity) = in[1]; - CeedScalar(*model_outputs) = out[0]; - - const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx; - - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar model_inputs_i[6], model_outputs_i[6]; - - StoredValuesUnpack(Q, i, 0, 6, (const CeedScalar *)model_inputs, model_inputs_i); - DataDrivenInference(model_inputs_i, model_outputs_i, sgsdd_ctx); - ScaleN(model_outputs_i, inv_multiplicity[i], 6); - StoredValuesPack(Q, i, 0, 6, model_outputs_i, model_outputs); - } - return CEED_ERROR_SUCCESS; -} - -// @brief Calculates SGS from outputs of anisotropic data-driven model -CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Outputs)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar(*model_outputs) = in[0]; - const CeedScalar(*A_ij_delta)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - const CeedScalar(*inv_multiplicity) = (const CeedScalar(*))in[2]; - const CeedScalar(*eigenvectors_stored) = in[3]; - CeedScalar(*kmsgs_stress)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx; - CeedScalar new_bounds[6][2]; - CopyN(&sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling], (CeedScalar *)new_bounds, 12); - - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - CeedScalar model_outputs_i[6]; - const CeedScalar delta = A_ij_delta[6][i]; - - StoredValuesUnpack(Q, i, 0, 6, model_outputs, model_outputs_i); - CeedScalar grad_velo_magnitude, eigenvectors[3][3], kmsgs_stress_i[6]; - StoredValuesUnpack(Q, i, 0, 9, eigenvectors_stored, (CeedScalar *)eigenvectors); - StoredValuesUnpack(Q, i, 9, 1, 
eigenvectors_stored, &grad_velo_magnitude); - ComputeSgsDDOutputs(model_outputs_i, delta, eigenvectors, new_bounds, grad_velo_magnitude, kmsgs_stress_i); - - for (int j = 0; j < 6; j++) kmsgs_stress[j][i] = inv_multiplicity[i] * kmsgs_stress_i[j]; - } - return CEED_ERROR_SUCCESS; -} - -// @brief Adds subgrid stress to residual (during IFunction evaluation) -CEED_QFUNCTION_HELPER int FluxSubgridStress(const StatePrimitive Y, const CeedScalar km_sgs[6], CeedScalar Flux[5][3]) { - CeedScalar sgs[3][3]; - - KMUnpack(km_sgs, sgs); - for (CeedInt j = 0; j < 3; j++) { - Flux[0][j] = 0.; - for (CeedInt k = 0; k < 3; k++) Flux[k + 1][j] = sgs[k][j]; - Flux[4][j] = Y.velocity[0] * sgs[0][j] + Y.velocity[1] * sgs[1][j] + Y.velocity[2] * sgs[2][j]; - } - return 0; -} - -CEED_QFUNCTION_HELPER int IFunction_NodalSgs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var) { - const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; - const CeedScalar(*q_data) = in[1]; - const CeedScalar(*km_sgs)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; - CeedScalar(*Grad_v)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[0]; - - NewtonianIdealGasContext gas = (NewtonianIdealGasContext)ctx; - - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; - const State s = StateFromQ(gas, qi, state_var); - - CeedScalar wdetJ, dXdx[3][3]; - QdataUnpack_3D(Q, i, q_data, &wdetJ, dXdx); - - CeedScalar Flux[5][3]; - const CeedScalar km_sgs_i[6] = {km_sgs[0][i], km_sgs[1][i], km_sgs[2][i], km_sgs[3][i], km_sgs[4][i], km_sgs[5][i]}; - FluxSubgridStress(s.Y, km_sgs_i, Flux); - - for (CeedInt k = 0; k < 3; k++) { - for (CeedInt j = 0; j < 5; j++) { - Grad_v[k][j][i] = -wdetJ * (dXdx[k][0] * Flux[j][0] + dXdx[k][1] * Flux[j][1] + dXdx[k][2] * Flux[j][2]); - } - } - } - return 0; -} - -CEED_QFUNCTION(IFunction_NodalSgs_Conserv)(void *ctx, CeedInt Q, const CeedScalar 
*const *in, CeedScalar *const *out) { - return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_CONSERVATIVE); -} - -CEED_QFUNCTION(IFunction_NodalSgs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_PRIMITIVE); -} diff --git a/examples/fluids/qfunctions/sgs_dd_training.h b/examples/fluids/qfunctions/sgs_dd_training.h deleted file mode 100644 index 803f959a1d..0000000000 --- a/examples/fluids/qfunctions/sgs_dd_training.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Structs and helper functions for training data-driven subgrid-stress models -/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy -/// correction models for data-informed Reynolds stress closure' 2022 -#include - -#include "differential_filter_enums.h" -#include "newtonian_state.h" -#include "newtonian_types.h" -#include "sgs_dd_utils.h" -#include "utils.h" -#include "utils_eigensolver_jacobi.h" - -typedef struct SGS_DD_TrainingContext_ *SGS_DDTrainingContext; -struct SGS_DD_TrainingContext_ { - struct NewtonianIdealGasContext_ gas; -}; - -// @brief Calculate Data-Driven SGS model training data at nodes -CEED_QFUNCTION_HELPER int ComputeSGS_DDAnisotropicTrainingDataNodal(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, - StateVariable state_var) { - const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; - const CeedScalar(*velo_prod)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2]; - const 
CeedScalar(*A_ij_delta)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - const CeedScalar(*inv_multiplicity) = (const CeedScalar(*))in[4]; - CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - const SGS_DDTrainingContext sgsdd_ctx = (SGS_DDTrainingContext)ctx; - const NewtonianIdealGasContext gas = &sgsdd_ctx->gas; - - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]}; - const CeedScalar grad_velo_aniso[3][3] = { - {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]}, - {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]}, - {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]} - }; - const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]}; - const CeedScalar delta = A_ij_delta[6][i]; - const State s = StateFromQ(gas, qi, state_var); - CeedScalar inputs[6]; - CeedScalar eigenvectors[3][3], grad_velo_magnitude; // dummy variables, don't actually use them - - ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, eigenvectors, inputs, &grad_velo_magnitude); - - for (int j = 0; j < 6; j++) v[j][i] = inv_multiplicity[i] * inputs[j]; - - v[0 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XX][i] - Square(s.Y.velocity[0])) * inv_multiplicity[i]; - v[1 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_YY][i] - Square(s.Y.velocity[1])) * inv_multiplicity[i]; - v[2 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_ZZ][i] - Square(s.Y.velocity[2])) * inv_multiplicity[i]; - v[3 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_YZ][i] - s.Y.velocity[1] * s.Y.velocity[2]) * inv_multiplicity[i]; - v[4 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XZ][i] - s.Y.velocity[0] * s.Y.velocity[2]) * inv_multiplicity[i]; - v[5 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XY][i] - s.Y.velocity[0] * s.Y.velocity[1]) * inv_multiplicity[i]; - } - return 0; -} - 
-CEED_QFUNCTION(ComputeSGS_DDAnisotropicTrainingDataNodal_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - return ComputeSGS_DDAnisotropicTrainingDataNodal(ctx, Q, in, out, STATEVAR_PRIMITIVE); -} diff --git a/examples/fluids/qfunctions/sgs_dd_utils.h b/examples/fluids/qfunctions/sgs_dd_utils.h deleted file mode 100644 index 4bcb9fc181..0000000000 --- a/examples/fluids/qfunctions/sgs_dd_utils.h +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Structs and helper functions for data-driven subgrid-stress modeling -/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy -/// correction models for data-informed Reynolds stress closure' 2022 -#pragma once - -#include - -#include "newtonian_state.h" -#include "newtonian_types.h" -#include "utils.h" -#include "utils_eigensolver_jacobi.h" - -// @brief Calculate Frobenius norm of velocity gradient from eigenframe quantities -CEED_QFUNCTION_HELPER CeedScalar VelocityGradientMagnitude(const CeedScalar strain_sframe[3], const CeedScalar vorticity_sframe[3]) { - return sqrt(Dot3(strain_sframe, strain_sframe) + 0.5 * Dot3(vorticity_sframe, vorticity_sframe)); -}; - -// @brief Change the order of basis vectors so that they align with vector and obey right-hand rule -// @details The e_1 and e_3 basis vectors are the closest aligned to the vector. The e_2 is set via e_3 x e_1 -// The basis vectors are assumed to form the rows of the basis matrix. 
-CEED_QFUNCTION_HELPER void OrientBasisWithVector(CeedScalar basis[3][3], const CeedScalar vector[3]) { - CeedScalar alignment[3] = {0.}, cross[3]; - - MatVec3(basis, vector, CEED_NOTRANSPOSE, alignment); - - if (alignment[0] < 0) ScaleN(basis[0], -1, 3); - if (alignment[2] < 0) ScaleN(basis[2], -1, 3); - - Cross3(basis[2], basis[0], cross); - CeedScalar basis_1_orientation = Dot3(cross, basis[1]); - if (basis_1_orientation < 0) ScaleN(basis[1], -1, 3); -} - -// @brief Denormalize outputs using min-max (de-)normalization -CEED_QFUNCTION_HELPER void DenormalizeDDOutputs(CeedScalar output[6], const CeedScalar new_bounds[6][2], const CeedScalar old_bounds[6][2]) { - CeedScalar bounds_ratio; - for (int i = 0; i < 6; i++) { - bounds_ratio = (new_bounds[i][1] - new_bounds[i][0]) / (old_bounds[i][1] - old_bounds[i][0]); - output[i] = bounds_ratio * (output[i] - old_bounds[i][1]) + new_bounds[i][1]; - } -} - -/** - * @brief Compute model inputs for anisotropic data-driven model - * - * @param[in] grad_velo_aniso Gradient of velocity in physical (anisotropic) coordinates - * @param[in] km_A_ij Anisotropy tensor, in Kelvin-Mandel notation - * @param[in] delta Length used to create anisotropy tensor - * @param[in] viscosity Kinematic viscosity - * @param[out] eigenvectors Eigenvectors of the (anisotropic) velocity gradient - * @param[out] inputs Data-driven model inputs - * @param[out] grad_velo_magnitude Frobenius norm of the velocity gradient - */ -CEED_QFUNCTION_HELPER void ComputeSgsDDInputs(const CeedScalar grad_velo_aniso[3][3], const CeedScalar km_A_ij[6], const CeedScalar delta, - const CeedScalar viscosity, CeedScalar eigenvectors[3][3], CeedScalar inputs[6], - CeedScalar *grad_velo_magnitude) { - CeedScalar strain_sframe[3] = {0.}, vorticity_sframe[3] = {0.}; - CeedScalar A_ij[3][3] = {{0.}}, grad_velo_iso[3][3] = {{0.}}; - - // -- Transform physical, anisotropic velocity gradient to isotropic - KMUnpack(km_A_ij, A_ij); - MatMat3(grad_velo_aniso, A_ij, 
CEED_NOTRANSPOSE, CEED_NOTRANSPOSE, grad_velo_iso); - - { // -- Get Eigenframe - CeedScalar kmstrain_iso[6], strain_iso[3][3]; - CeedInt work_vector[3] = {0}; - KMStrainRate(grad_velo_iso, kmstrain_iso); - KMUnpack(kmstrain_iso, strain_iso); - Diagonalize3(strain_iso, strain_sframe, eigenvectors, work_vector, SORT_DECREASING_EVALS, true, 5); - } - - { // -- Get vorticity in S-frame - CeedScalar rotation_iso[3][3]; - RotationRate(grad_velo_iso, rotation_iso); - CeedScalar vorticity_iso[3] = {-2 * rotation_iso[1][2], 2 * rotation_iso[0][2], -2 * rotation_iso[0][1]}; - OrientBasisWithVector(eigenvectors, vorticity_iso); - MatVec3(eigenvectors, vorticity_iso, CEED_NOTRANSPOSE, vorticity_sframe); - } - - // -- Calculate DD model inputs - *grad_velo_magnitude = VelocityGradientMagnitude(strain_sframe, vorticity_sframe); - inputs[0] = strain_sframe[0]; - inputs[1] = strain_sframe[1]; - inputs[2] = strain_sframe[2]; - inputs[3] = vorticity_sframe[0]; - inputs[4] = vorticity_sframe[1]; - inputs[5] = viscosity / Square(delta); - ScaleN(inputs, 1 / (*grad_velo_magnitude + CEED_EPSILON), 6); -} - -/** - * @brief Compute the physical SGS stresses from the neural-network output - * - * @param[in,out] outputs Outputs from the neural-network - * @param[in] delta Length used to create anisotropy tensor - * @param[in] eigenvectors Eigenvectors of the (anisotropic) velocity gradient - * @param[in] new_bounds Bounds used for min-max de-normalization - * @param[in] grad_velo_magnitude Magnitude of the velocity gradient - * @param[out] kmsgs_stress Physical SGS stresses in Kelvin-Mandel notation - */ -CEED_QFUNCTION_HELPER void ComputeSgsDDOutputs(CeedScalar outputs[6], const CeedScalar delta, const CeedScalar eigenvectors[3][3], - const CeedScalar new_bounds[6][2], const CeedScalar grad_velo_magnitude, CeedScalar kmsgs_stress[6]) { - CeedScalar old_bounds[6][2] = {{0}}; - for (int j = 0; j < 6; j++) old_bounds[j][1] = 1; - DenormalizeDDOutputs(outputs, new_bounds, old_bounds); - - // 
Re-dimensionalize sgs_stress - ScaleN(outputs, Square(delta) * Square(grad_velo_magnitude), 6); - - CeedScalar sgs_stress[3][3] = {{0.}}; - { // Rotate SGS Stress back to physical frame, SGS_physical = E^T SGS_sframe E - CeedScalar Evec_sgs[3][3] = {{0.}}; - const CeedScalar sgs_sframe[3][3] = { - {outputs[0], outputs[3], outputs[4]}, - {outputs[3], outputs[1], outputs[5]}, - {outputs[4], outputs[5], outputs[2]}, - }; - MatMat3(eigenvectors, sgs_sframe, CEED_TRANSPOSE, CEED_NOTRANSPOSE, Evec_sgs); - MatMat3(Evec_sgs, eigenvectors, CEED_NOTRANSPOSE, CEED_NOTRANSPOSE, sgs_stress); - } - - KMPack(sgs_stress, kmsgs_stress); -} diff --git a/examples/fluids/qfunctions/shocktube.h b/examples/fluids/qfunctions/shocktube.h index 87cdf73d4d..64e0798a44 100644 --- a/examples/fluids/qfunctions/shocktube.h +++ b/examples/fluids/qfunctions/shocktube.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -10,8 +10,11 @@ // Model from: // On the Order of Accuracy and Numerical Performance of Two Classes of Finite Volume WENO Schemes, Zhang, Zhang, and Shu (2011). -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#include +#endif #include "utils.h" diff --git a/examples/fluids/qfunctions/stabilization.h b/examples/fluids/qfunctions/stabilization.h index 55d99820c3..87f05823aa 100644 --- a/examples/fluids/qfunctions/stabilization.h +++ b/examples/fluids/qfunctions/stabilization.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,7 @@ /// @file /// Helper functions for computing stabilization terms of a newtonian simulation -#include +#include #include "newtonian_state.h" diff --git a/examples/fluids/qfunctions/stabilization_types.h b/examples/fluids/qfunctions/stabilization_types.h index 97492dd73a..8544e428e9 100644 --- a/examples/fluids/qfunctions/stabilization_types.h +++ b/examples/fluids/qfunctions/stabilization_types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h index d6c7464660..2e8b05db1c 100644 --- a/examples/fluids/qfunctions/stg_shur14.h +++ b/examples/fluids/qfunctions/stg_shur14.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -12,9 +12,11 @@ /// SetupSTG_Rand reads in the input files and fills in STGShur14Context. /// Then STGShur14_CalcQF is run over quadrature points. /// Before the program exits, TearDownSTG is run to free the memory of the allocated arrays. 
-#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include #include +#endif #include "newtonian_state.h" #include "setupgeo_helpers.h" @@ -102,12 +104,12 @@ CEED_QFUNCTION_HELPER CeedScalar Calc_qn(const CeedScalar kappa, const CeedScala } // Calculate hmax, ke, keta, and kcut -CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3], +CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar hNodSep[3], const CeedScalar nu, CeedScalar *hmax, CeedScalar *ke, CeedScalar *keta, CeedScalar *kcut) { - *hmax = Max(Max(h[0], h[1]), h[2]); + *hmax = Max(Max(hNodSep[0], hNodSep[1]), hNodSep[2]); *ke = wall_dist == 0 ? 1e16 : 2 * M_PI / Min(2 * wall_dist, 3 * lt); *keta = 2 * M_PI * pow(Cube(nu) / eps, -0.25); - *kcut = M_PI / Min(Max(Max(h[1], h[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax); + *kcut = M_PI / Min(Max(Max(hNodSep[1], hNodSep[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax); } /* @@ -115,21 +117,21 @@ CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const C * * Calculates q_n at a given distance to the wall * - * @param[in] wall_dist Distance to the nearest wall - * @param[in] eps Turbulent dissipation w/rt wall_dist - * @param[in] lt Turbulent length scale w/rt wall_dist - * @param[in] h Element lengths in coordinate directions - * @param[in] nu Dynamic Viscosity; - * @param[in] stg_ctx STGShur14Context for the problem - * @param[out] qn Spectrum coefficients, [nmodes] + * @param[in] wall_dist Distance to the nearest wall + * @param[in] eps Turbulent dissipation w/rt wall_dist + * @param[in] lt Turbulent length scale w/rt wall_dist + * @param[in] h_node_sep Element lengths in coordinate directions + * @param[in] nu Dynamic Viscosity; + * @param[in] stg_ctx STGShur14Context for the problem + * @param[out] qn Spectrum coefficients, [nmodes] */ -CEED_QFUNCTION_HELPER void 
CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3], +CEED_QFUNCTION_HELPER void CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h_node_sep[3], const CeedScalar nu, CeedScalar qn[], const StgShur14Context stg_ctx) { const CeedInt nmodes = stg_ctx->nmodes; const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; CeedScalar hmax, ke, keta, kcut, Ektot = 0.0; - SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut); + SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut); for (CeedInt n = 0; n < nmodes; n++) { const CeedScalar dkappa = n == 0 ? kappa[0] : kappa[n] - kappa[n - 1]; @@ -181,28 +183,29 @@ CEED_QFUNCTION_HELPER void StgShur14Calc(const CeedScalar X[3], const CeedScalar /****************************************************** * @brief Calculate u(x,t) for STG inflow condition * - * @param[in] X Location to evaluate u(X,t) - * @param[in] t Time to evaluate u(X,t) - * @param[in] ubar Mean velocity at X - * @param[in] cij Cholesky decomposition at X - * @param[in] Ektot Total spectrum energy at this location - * @param[in] h Element size in 3 directions - * @param[in] wall_dist Distance to closest wall - * @param[in] eps Turbulent dissipation - * @param[in] lt Turbulent length scale - * @param[out] u Velocity at X and t - * @param[in] stg_ctx STGShur14Context for the problem + * @param[in] X Location to evaluate u(X,t) + * @param[in] t Time to evaluate u(X,t) + * @param[in] ubar Mean velocity at X + * @param[in] cij Cholesky decomposition at X + * @param[in] Ektot Total spectrum energy at this location + * @param[in] h_node_sep Element size in 3 directions + * @param[in] wall_dist Distance to closest wall + * @param[in] eps Turbulent dissipation + * @param[in] lt Turbulent length scale + * @param[out] u Velocity at X and t + * @param[in] stg_ctx STGShur14Context for the problem */ CEED_QFUNCTION_HELPER 
void StgShur14Calc_PrecompEktot(const CeedScalar X[3], const CeedScalar t, const CeedScalar ubar[3], const CeedScalar cij[6], - const CeedScalar Ektot, const CeedScalar h[3], const CeedScalar wall_dist, const CeedScalar eps, - const CeedScalar lt, const CeedScalar nu, CeedScalar u[3], const StgShur14Context stg_ctx) { + const CeedScalar Ektot, const CeedScalar h_node_sep[3], const CeedScalar wall_dist, + const CeedScalar eps, const CeedScalar lt, const CeedScalar nu, CeedScalar u[3], + const StgShur14Context stg_ctx) { const CeedInt nmodes = stg_ctx->nmodes; const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; const CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; const CeedScalar *sigma = &stg_ctx->data[stg_ctx->offsets.sigma]; const CeedScalar *d = &stg_ctx->data[stg_ctx->offsets.d]; CeedScalar hmax, ke, keta, kcut; - SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut); + SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut); CeedScalar xdotd, vp[3] = {0.}; CeedScalar xhat[] = {0., X[1], X[2]}; @@ -254,12 +257,13 @@ CEED_QFUNCTION(StgShur14Preprocess)(void *ctx, CeedInt Q, const CeedScalar *cons {dXdx_q[1][0][i], dXdx_q[1][1][i], dXdx_q[1][2][i]}, }; - CeedScalar h[3]; - h[0] = dx; - for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]); + CeedScalar h_node_sep[3]; + h_node_sep[0] = dx; + for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]); + ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3); InterpolateProfile(wall_dist, ubar, cij, &eps, <, stg_ctx); - SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut); + SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut); // Calculate total TKE per spectrum CeedScalar Ek_tot = 0; @@ -279,48 +283,38 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc const CeedScalar(*J)[3][CEED_Q_VLA] = 
(const CeedScalar(*)[3][CEED_Q_VLA])in[1]; CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const StgShur14Context stg_ctx = (StgShur14Context)ctx; - CeedScalar qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt; - const CeedScalar dx = stg_ctx->dx; - const CeedScalar time = stg_ctx->time; - const CeedScalar theta0 = stg_ctx->theta0; - const CeedScalar P0 = stg_ctx->P0; - const CeedScalar cv = stg_ctx->newtonian_ctx.cv; - const CeedScalar rho = P0 / (GasConstant(&stg_ctx->newtonian_ctx) * theta0); - const CeedScalar nu = stg_ctx->newtonian_ctx.mu / rho; + const StgShur14Context stg_ctx = (StgShur14Context)ctx; + const NewtonianIdealGasContext gas = &stg_ctx->newtonian_ctx; + CeedScalar qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt; + const CeedScalar dx = stg_ctx->dx; + const CeedScalar time = stg_ctx->time; + const CeedScalar theta0 = stg_ctx->theta0; + const CeedScalar P0 = stg_ctx->P0; + const CeedScalar rho = P0 / (GasConstant(gas) * theta0); + const CeedScalar nu = gas->mu / rho; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]}; CeedScalar dXdx[3][3]; InvertMappingJacobian_3D(Q, i, J, dXdx, NULL); - CeedScalar h[3]; - h[0] = dx; - for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j])); + CeedScalar h_node_sep[3]; + h_node_sep[0] = dx; + for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j])); + ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3); InterpolateProfile(x_i[1], ubar, cij, &eps, <, stg_ctx); if (stg_ctx->use_fluctuating_IC) { - CalcSpectrum(x_i[1], eps, lt, h, nu, qn, stg_ctx); + CalcSpectrum(x_i[1], eps, lt, h_node_sep, nu, qn, stg_ctx); StgShur14Calc(x_i, time, ubar, cij, qn, u, stg_ctx); } else { for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; } - switch (stg_ctx->newtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - q0[0][i] = rho; - q0[1][i] = u[0] * 
rho; - q0[2][i] = u[1] * rho; - q0[3][i] = u[2] * rho; - q0[4][i] = rho * (0.5 * Dot3(u, u) + cv * theta0); - break; - - case STATEVAR_PRIMITIVE: - q0[0][i] = P0; - q0[1][i] = u[0]; - q0[2][i] = u[1]; - q0[3][i] = u[2]; - q0[4][i] = theta0; - break; + CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.}; + State s = StateFromY(gas, Y); + StateToQ(gas, s, q, gas->state_var); + for (CeedInt j = 0; j < 5; j++) { + q0[j][i] = q[j]; } } return 0; @@ -361,13 +355,14 @@ CEED_QFUNCTION(StgShur14Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *i QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm); wdetJb *= is_implicit ? -1. : 1.; - CeedScalar h[3]; - h[0] = dx; - for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); + CeedScalar h_node_sep[3]; + h_node_sep[0] = dx; + for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); + ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3); InterpolateProfile(X[1][i], ubar, cij, &eps, <, stg_ctx); if (!mean_only) { - CalcSpectrum(X[1][i], eps, lt, h, mu / rho, qn, stg_ctx); + CalcSpectrum(X[1][i], eps, lt, h_node_sep, mu / rho, qn, stg_ctx); StgShur14Calc(x, time, ubar, cij, qn, u, stg_ctx); } else { for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; @@ -477,15 +472,16 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar * const CeedScalar(*inv_Ektotal) = (const CeedScalar(*))in[3]; CeedScalar(*bcval)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const StgShur14Context stg_ctx = (StgShur14Context)ctx; - CeedScalar u[3], ubar[3], cij[6], eps, lt; - const bool mean_only = stg_ctx->mean_only; - const CeedScalar dx = stg_ctx->dx; - const CeedScalar time = stg_ctx->time; - const CeedScalar theta0 = stg_ctx->theta0; - const CeedScalar P0 = stg_ctx->P0; - const CeedScalar rho = P0 / (GasConstant(&stg_ctx->newtonian_ctx) * theta0); - const CeedScalar nu = stg_ctx->newtonian_ctx.mu / rho; + const StgShur14Context 
stg_ctx = (StgShur14Context)ctx; + const NewtonianIdealGasContext gas = &stg_ctx->newtonian_ctx; + CeedScalar u[3], ubar[3], cij[6], eps, lt; + const bool mean_only = stg_ctx->mean_only; + const CeedScalar dx = stg_ctx->dx; + const CeedScalar time = stg_ctx->time; + const CeedScalar theta0 = stg_ctx->theta0; + const CeedScalar P0 = stg_ctx->P0; + const CeedScalar rho = P0 / (GasConstant(gas) * theta0); + const CeedScalar nu = gas->mu / rho; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar x[] = {coords[0][i], coords[1][i], coords[2][i]}; @@ -494,40 +490,41 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar * {dXdx_q[1][0][i], dXdx_q[1][1][i], dXdx_q[1][2][i]}, }; - CeedScalar h[3]; - h[0] = dx; - for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); + CeedScalar h_node_sep[3]; + h_node_sep[0] = dx; + for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); + ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3); InterpolateProfile(coords[1][i], ubar, cij, &eps, <, stg_ctx); if (!mean_only) { if (1) { - StgShur14Calc_PrecompEktot(x, time, ubar, cij, inv_Ektotal[i], h, x[1], eps, lt, nu, u, stg_ctx); + StgShur14Calc_PrecompEktot(x, time, ubar, cij, inv_Ektotal[i], h_node_sep, x[1], eps, lt, nu, u, stg_ctx); } else { // Original way CeedScalar qn[STG_NMODES_MAX]; - CalcSpectrum(coords[1][i], eps, lt, h, nu, qn, stg_ctx); + CalcSpectrum(coords[1][i], eps, lt, h_node_sep, nu, qn, stg_ctx); StgShur14Calc(x, time, ubar, cij, qn, u, stg_ctx); } } else { for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; } - switch (stg_ctx->newtonian_ctx.state_var) { + CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.}; + State s = StateFromY(gas, Y); + StateToQ(gas, s, q, gas->state_var); + switch (gas->state_var) { case STATEVAR_CONSERVATIVE: - bcval[0][i] = scale[i] * rho; - bcval[1][i] = scale[i] * rho * u[0]; - bcval[2][i] = scale[i] * rho * u[1]; - 
bcval[3][i] = scale[i] * rho * u[2]; - bcval[4][i] = 0.; + q[4] = 0.; // Don't set energy break; - case STATEVAR_PRIMITIVE: - bcval[0][i] = 0; - bcval[1][i] = scale[i] * u[0]; - bcval[2][i] = scale[i] * u[1]; - bcval[3][i] = scale[i] * u[2]; - bcval[4][i] = scale[i] * theta0; + q[0] = 0; // Don't set pressure + break; + case STATEVAR_ENTROPY: + q[0] = 0; // Don't set V_density break; } + for (CeedInt j = 0; j < 5; j++) { + bcval[j][i] = scale[i] * q[j]; + } } return 0; } diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h index a8ed21c292..945956de84 100644 --- a/examples/fluids/qfunctions/stg_shur14_type.h +++ b/examples/fluids/qfunctions/stg_shur14_type.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -6,7 +6,10 @@ // This file is part of CEED: http://github.com/ceed #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif #include "newtonian_types.h" @@ -25,6 +28,7 @@ struct STGShur14Context_ { bool is_implicit; // !< Whether using implicit time integration bool mean_only; // !< Only apply the mean profile CeedScalar dx; // !< dx used for h calculation + CeedScalar h_scale_factor; // !< Scales the element size bool prescribe_T; // !< Prescribe temperature weakly bool use_fluctuating_IC; // !< Only apply the mean profile struct NewtonianIdealGasContext_ newtonian_ctx; diff --git a/examples/fluids/qfunctions/strong_boundary_conditions.h b/examples/fluids/qfunctions/strong_boundary_conditions.h index a503a236d9..1526580963 100644 --- a/examples/fluids/qfunctions/strong_boundary_conditions.h +++ b/examples/fluids/qfunctions/strong_boundary_conditions.h @@ -1,10 +1,10 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include #include "setupgeo_helpers.h" diff --git a/examples/fluids/qfunctions/taylorgreen.h b/examples/fluids/qfunctions/taylorgreen.h index 72c128400d..c28e718913 100644 --- a/examples/fluids/qfunctions/taylorgreen.h +++ b/examples/fluids/qfunctions/taylorgreen.h @@ -1,11 +1,13 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #include "newtonian_state.h" #include "newtonian_types.h" @@ -17,12 +19,12 @@ CEED_QFUNCTION(ICsTaylorGreen)(void *ctx, CeedInt Q, const CeedScalar *const *in CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const SetupContext context = (SetupContext)ctx; - struct NewtonianIdealGasContext_ *gas = &context->gas; - CeedScalar R = GasConstant(gas); - StatePrimitive reference = context->reference; - const CeedScalar V0 = sqrt(Dot3(reference.velocity, reference.velocity)); - const CeedScalar density0 = reference.pressure / (reference.temperature * R); + const SetupContext context = (SetupContext)ctx; + const NewtonianIdealGasContext gas = &context->gas; + CeedScalar R = GasConstant(gas); + StatePrimitive reference = context->reference; + const CeedScalar V0 = sqrt(Dot3(reference.velocity, reference.velocity)); + const CeedScalar density0 = reference.pressure / (reference.temperature * R); CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { CeedScalar x[] = {X[0][i], X[1][i], X[2][i]}; @@ -36,15 +38,7 @@ CEED_QFUNCTION(ICsTaylorGreen)(void *ctx, CeedInt Q, const CeedScalar *const *in Y[4] = reference.temperature; State s = StateFromY(gas, Y); - switch (gas->state_var) { - case STATEVAR_CONSERVATIVE: - UnpackState_U(s.U, q); - break; - case STATEVAR_PRIMITIVE: - UnpackState_Y(s.Y, q); - break; - } - + StateToQ(gas, s, q, gas->state_var); for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; } return 0; diff --git a/examples/fluids/qfunctions/turb_spanstats.h b/examples/fluids/qfunctions/turb_spanstats.h index dccba29a7b..6331b119e9 100644 --- a/examples/fluids/qfunctions/turb_spanstats.h +++ b/examples/fluids/qfunctions/turb_spanstats.h @@ -1,10 +1,10 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include #include "newtonian_state.h" #include "turb_stats_types.h" @@ -59,6 +59,10 @@ CEED_QFUNCTION(ChildStatsCollection_Prim)(void *ctx, CeedInt Q, const CeedScalar return ChildStatsCollection(ctx, Q, in, out, STATEVAR_PRIMITIVE); } +CEED_QFUNCTION(ChildStatsCollection_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return ChildStatsCollection(ctx, Q, in, out, STATEVAR_ENTROPY); +} + // QFunctions for testing CEED_QFUNCTION_HELPER CeedScalar ChildStatsCollectionTest_Exact(const CeedScalar x_i[3]) { return x_i[0] + Square(x_i[1]); } diff --git a/examples/fluids/qfunctions/turb_stats_types.h b/examples/fluids/qfunctions/turb_stats_types.h index 95136f9ff0..dccae3653a 100644 --- a/examples/fluids/qfunctions/turb_stats_types.h +++ b/examples/fluids/qfunctions/turb_stats_types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/qfunctions/utils.h b/examples/fluids/qfunctions/utils.h index f414e14e9c..bd9d787efc 100644 --- a/examples/fluids/qfunctions/utils.h +++ b/examples/fluids/qfunctions/utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -6,8 +6,10 @@ // This file is part of CEED: http://github.com/ceed #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #ifndef M_PI #define M_PI 3.14159265358979323846 diff --git a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h index b8236789d2..71587633dd 100644 --- a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h +++ b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -9,8 +9,11 @@ /// Eigen system solver for symmetric NxN matrices. Modified from the CC0 code provided at https://github.com/jewettaij/jacobi_pd #pragma once -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#include +#endif #include "utils.h" diff --git a/examples/fluids/qfunctions/velocity_gradient_projection.h b/examples/fluids/qfunctions/velocity_gradient_projection.h index 73b51eff84..2fecc3f258 100644 --- a/examples/fluids/qfunctions/velocity_gradient_projection.h +++ b/examples/fluids/qfunctions/velocity_gradient_projection.h @@ -1,10 +1,10 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include #include "newtonian_state.h" #include "newtonian_types.h" @@ -47,3 +47,7 @@ CEED_QFUNCTION(VelocityGradientProjectionRHS_Conserv)(void *ctx, CeedInt Q, cons CEED_QFUNCTION(VelocityGradientProjectionRHS_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return VelocityGradientProjectionRHS(ctx, Q, in, out, STATEVAR_PRIMITIVE); } + +CEED_QFUNCTION(VelocityGradientProjectionRHS_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + return VelocityGradientProjectionRHS(ctx, Q, in, out, STATEVAR_ENTROPY); +} diff --git a/examples/fluids/smartsim_regression_framework.py b/examples/fluids/smartsim_regression_framework.py deleted file mode 100755 index 2834263e6a..0000000000 --- a/examples/fluids/smartsim_regression_framework.py +++ /dev/null @@ -1,241 +0,0 @@ -#!/usr/bin/env python3 -from junit_xml import TestCase -from smartsim import Experiment -from smartsim.settings import RunSettings -from smartredis import Client -import numpy as np -from pathlib import Path -import argparse -import traceback -import sys -import time -from typing import Tuple -import os -import shutil -import logging -import socket - -# autopep8 off -sys.path.insert(0, (Path(__file__).parents[3] / "tests/junit-xml").as_posix()) -# autopep8 on - -logging.disable(logging.WARNING) - -fluids_example_dir = Path(__file__).parent.absolute() - - -def getOpenSocket(): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(('', 0)) - addr = s.getsockname() - s.close() - return addr[1] - - -class NoError(Exception): - pass - - -def assert_np_all(test, truth): - """Assert with better error reporting""" - try: - assert np.all(test == truth) - except Exception as e: - raise Exception(f"Expected {truth}, but got {test}") from e - - -def assert_equal(test, truth): - """Assert with better error reporting""" - try: - assert 
test == truth - except Exception as e: - raise Exception(f"Expected {truth}, but got {test}") from e - - -def verify_training_data(database_array, correct_array, ceed_resource, atol=1e-8, rtol=1e-8): - """Verify the training data - - Cannot just use np.allclose due to vorticity vector directionality. - Check whether the S-frame-oriented vorticity vector's second component is just flipped. - This can happen due to the eigenvector ordering changing based on whichever one is closest to the vorticity vector. - If two eigenvectors are very close to the vorticity vector, this can cause the ordering to flip. - This flipping of the vorticity vector is not incorrect, just a known sensitivity of the model. - """ - if not np.allclose(database_array, correct_array, atol=atol, rtol=rtol): - - total_tolerances = atol + rtol * np.abs(correct_array) # mimic np.allclose tolerance calculation - idx_notclose = np.where(np.abs(database_array - correct_array) > total_tolerances) - if not np.all(idx_notclose[1] == 4): - # values other than vorticity are not close - test_fail = True - else: - database_vorticity = database_array[idx_notclose] - correct_vorticity = correct_array[idx_notclose] - test_fail = False if np.allclose(-database_vorticity, correct_vorticity, - atol=atol, rtol=rtol) else True - - if test_fail: - database_output_path = Path( - f"./y0_database_values_{ceed_resource.replace('/', '_')}.npy").absolute() - np.save(database_output_path, database_array) - raise AssertionError(f"Array values in database max difference: {np.max(np.abs(correct_array - database_array))}\n" - f"Array saved to {database_output_path.as_posix()}") - - -class SmartSimTest(object): - - def __init__(self, directory_path: Path): - self.exp: Experiment - self.database = None - self.directory_path: Path = directory_path - self.original_path: Path - - def setup(self): - """To create the test directory and start SmartRedis database""" - self.original_path = Path(os.getcwd()) - - if 
self.directory_path.exists() and self.directory_path.is_dir(): - shutil.rmtree(self.directory_path) - self.directory_path.mkdir() - os.chdir(self.directory_path) - - PORT = getOpenSocket() - self.exp = Experiment("test", launcher="local") - self.database = self.exp.create_database(port=PORT, batch=False, interface="lo") - self.exp.generate(self.database) - self.exp.start(self.database) - - # SmartRedis will complain if these aren't set - os.environ['SR_LOG_FILE'] = 'R' - os.environ['SR_LOG_LEVEL'] = 'INFO' - - def test(self, ceed_resource) -> Tuple[bool, Exception, str]: - client = None - arguments = [] - exe_path = "../../build/fluids-navierstokes" - try: - arguments = [ - '-ceed', ceed_resource, - '-options_file', (fluids_example_dir / 'blasius.yaml').as_posix(), - '-ts_max_steps', '2', - '-diff_filter_grid_based_width', - '-ts_monitor', '-snes_monitor', - '-diff_filter_ksp_max_it', '50', '-diff_filter_ksp_monitor', - '-degree', '1', - '-sgs_train_enable', - '-sgs_train_write_data_interval', '2', - '-sgs_train_filter_width_scales', '1.2,3.1', - '-bc_symmetry_z', - '-dm_plex_shape', 'zbox', - '-dm_plex_box_bd', 'none,none,periodic', - '-dm_plex_box_faces', '4,6,1', - '-mesh_transform', - ] - - run_settings = RunSettings(exe_path, exe_args=arguments) - - client_exp = self.exp.create_model(f"client_{ceed_resource.replace('/', '_')}", run_settings) - - # Start the client model - self.exp.start(client_exp, summary=False, block=True) - - client = Client(cluster=False, address=self.database.get_address()[0]) - - assert client.poll_tensor("sizeInfo", 250, 5) - assert_np_all(client.get_tensor("sizeInfo"), np.array([35, 12, 6, 1, 1, 0])) - - assert client.poll_tensor("check-run", 250, 5) - assert_equal(client.get_tensor("check-run")[0], 1) - - assert client.poll_tensor("tensor-ow", 250, 5) - assert_equal(client.get_tensor("tensor-ow")[0], 1) - - assert client.poll_tensor("num_filter_widths", 250, 5) - assert_equal(client.get_tensor("num_filter_widths")[0], 2) - - assert 
client.poll_tensor("step", 250, 5) - assert_equal(client.get_tensor("step")[0], 2) - - assert client.poll_tensor("y.0.0", 250, 5) - test_data_path = fluids_example_dir / "tests-output/y00_output.npy" - assert test_data_path.is_file() - correct_value = np.load(test_data_path) - database_value = client.get_tensor("y.0.0") - verify_training_data(database_value, correct_value, ceed_resource) - - assert client.poll_tensor("y.0.1", 250, 5) - test_data_path = fluids_example_dir / "tests-output/y01_output.npy" - assert test_data_path.is_file() - correct_value = np.load(test_data_path) - database_value = client.get_tensor("y.0.1") - verify_training_data(database_value, correct_value, ceed_resource) - - client.flush_db([os.environ["SSDB"]]) - output = (True, NoError(), exe_path + ' ' + ' '.join(arguments)) - except Exception as e: - output = (False, e, exe_path + ' ' + ' '.join(arguments)) - - finally: - if client: - client.flush_db([os.environ["SSDB"]]) - - return output - - def test_junit(self, ceed_resource): - start: float = time.time() - - passTest, exception, args = self.test(ceed_resource) - - output = "" if isinstance(exception, NoError) else ''.join( - traceback.TracebackException.from_exception(exception).format()) - - test_case = TestCase(f'SmartSim Test {ceed_resource}', - elapsed_sec=time.time() - start, - timestamp=time.strftime( - '%Y-%m-%d %H:%M:%S %Z', time.localtime(start)), - stdout=output, - stderr=output, - allow_multiple_subelements=True, - category=f'SmartSim Tests') - test_case.args = args - if not passTest and 'occa' in ceed_resource: - test_case.add_skipped_info("OCCA mode not supported") - elif not passTest: - test_case.add_failure_info("exception", output) - - return test_case - - def teardown(self): - self.exp.stop(self.database) - os.chdir(self.original_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser('Testing script for SmartSim integration') - parser.add_argument( - '-c', - '--ceed-backends', - type=str, - nargs='*', - 
default=['/cpu/self'], - help='libCEED backend to use with convergence tests') - args = parser.parse_args() - - test_dir = fluids_example_dir / "test_dir" - print("Setting up database...", end='') - test_framework = SmartSimTest(test_dir) - test_framework.setup() - print(" Done!") - for ceed_resource in args.ceed_backends: - print("working on " + ceed_resource + ' ...', end='') - passTest, exception, _ = test_framework.test(ceed_resource) - - if passTest: - print("Passed!") - else: - print("Failed!", file=sys.stderr) - print('\t' + ''.join(traceback.TracebackException.from_exception(exception).format()), file=sys.stderr) - - print("Cleaning up database...", end='') - test_framework.teardown() - print(" Done!") diff --git a/examples/fluids/src/bc_definition.c b/examples/fluids/src/bc_definition.c new file mode 100644 index 0000000000..acdb50a370 --- /dev/null +++ b/examples/fluids/src/bc_definition.c @@ -0,0 +1,106 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +/** + @brief Create `BCDefinition` + + @param[in] name Name of the boundary condition + @param[in] num_label_values Number of `DMLabel` values + @param[in] label_values Array of label values that define the boundaries controlled by the `BCDefinition`, size `num_label_values` + @param[out] bc_def The new `BCDefinition` +**/ +PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def) { + PetscFunctionBeginUser; + PetscCall(PetscNew(bc_def)); + + PetscCall(PetscStrallocpy(name, &(*bc_def)->name)); + (*bc_def)->num_label_values = num_label_values; + PetscCall(PetscMalloc1(num_label_values, &(*bc_def)->label_values)); + for (PetscInt i = 0; i < num_label_values; i++) (*bc_def)->label_values[i] = label_values[i]; + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Get base information for `BCDefinition` + + @param[in] bc_def `BCDefinition` to get information from + @param[out] name Name of the `BCDefinition` + @param[out] num_label_values Number of `DMLabel` values + @param[out] label_values Array of label values that define the boundaries controlled by the `BCDefinition`, size `num_label_values` +**/ +PetscErrorCode BCDefinitionGetInfo(BCDefinition bc_def, const char *name[], PetscInt *num_label_values, const PetscInt *label_values[]) { + PetscFunctionBeginUser; + if (name) *name = bc_def->name; + if (label_values) { + *num_label_values = bc_def->num_label_values; + *label_values = bc_def->label_values; + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Destroy a `BCDefinition` object + + @param[in,out] bc_def `BCDefinition` to be destroyed +**/ +PetscErrorCode BCDefinitionDestroy(BCDefinition *bc_def) { + PetscFunctionBeginUser; + if ((*bc_def)->name) PetscCall(PetscFree((*bc_def)->name)); + if ((*bc_def)->label_values) PetscCall(PetscFree((*bc_def)->label_values)); + if 
((*bc_def)->essential_comps) PetscCall(PetscFree((*bc_def)->essential_comps)); + PetscCall(PetscFree(*bc_def)); + *bc_def = NULL; + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Set `DM_BC_ESSENTIAL` boundary condition values + + @param[in,out] bc_def `BCDefinition` to set values to + @param[in] num_essential_comps Number of components to set + @param[in] essential_comps Array of components to set, size `num_essential_comps` +**/ +PetscErrorCode BCDefinitionSetEssential(BCDefinition bc_def, PetscInt num_essential_comps, PetscInt essential_comps[]) { + PetscFunctionBeginUser; + bc_def->num_essential_comps = num_essential_comps; + PetscCall(PetscMalloc1(num_essential_comps, &bc_def->essential_comps)); + PetscCall(PetscArraycpy(bc_def->essential_comps, essential_comps, num_essential_comps)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Get `DM_BC_ESSENTIAL` boundary condition values + + @param[in] bc_def `BCDefinition` to get values from + @param[out] num_essential_comps Number of essential components + @param[out] essential_comps Array of essential components, size `num_essential_comps` +**/ +PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essential_comps, const PetscInt *essential_comps[]) { + PetscFunctionBeginUser; + *num_essential_comps = bc_def->num_essential_comps; + *essential_comps = bc_def->essential_comps; + PetscFunctionReturn(PETSC_SUCCESS); +} + +#define LABEL_ARRAY_SIZE 256 + +// @brief See `PetscOptionsBCDefinition` +PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems PetscOptionsObject, const char opt[], const char text[], const char man[], + const char name[], BCDefinition *bc_def, PetscBool *set) { + PetscInt num_label_values = LABEL_ARRAY_SIZE, label_values[LABEL_ARRAY_SIZE] = {0}; + + PetscFunctionBeginUser; + PetscCall(PetscOptionsIntArray(opt, text, man, label_values, &num_label_values, set)); + if (num_label_values > 0) { + PetscCall(BCDefinitionCreate(name, num_label_values, label_values, 
bc_def)); + } else { + *bc_def = NULL; + } + PetscFunctionReturn(PETSC_SUCCESS); +} diff --git a/examples/fluids/src/boundary_condition.c b/examples/fluids/src/boundary_condition.c new file mode 100644 index 0000000000..89e917634d --- /dev/null +++ b/examples/fluids/src/boundary_condition.c @@ -0,0 +1,100 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "../navierstokes.h" + +/** + @brief Add `BCDefinition` to a `PetscSegBuffer` + + @param[in] bc_def `BCDefinition` to add + @param[in,out] bc_defs_seg `PetscSegBuffer` to add to +**/ +static PetscErrorCode AddBCDefinitionToSegBuffer(BCDefinition bc_def, PetscSegBuffer bc_defs_seg) { + BCDefinition *bc_def_ptr; + + PetscFunctionBeginUser; + if (bc_def == NULL) PetscFunctionReturn(PETSC_SUCCESS); + PetscCall(PetscSegBufferGet(bc_defs_seg, 1, &bc_def_ptr)); + *bc_def_ptr = bc_def; + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Create and setup `BCDefinition`s and `SimpleBC` from commandline options + + @param[in] user `User` + @param[in,out] problem `ProblemData` + @param[in] app_ctx `AppCtx` + @param[in,out] bc `SimpleBC` +**/ +PetscErrorCode BoundaryConditionSetUp(User user, ProblemData problem, AppCtx app_ctx, SimpleBC bc) { + PetscSegBuffer bc_defs_seg; + PetscBool flg; + BCDefinition bc_def; + + PetscFunctionBeginUser; + PetscCall(PetscSegBufferCreate(sizeof(BCDefinition), 4, &bc_defs_seg)); + + PetscOptionsBegin(user->comm, NULL, "Boundary Condition Options", NULL); + + PetscCall(PetscOptionsBCDefinition("-bc_wall", "Face IDs to apply wall BC", NULL, "wall", &bc_def, NULL)); + PetscCall(AddBCDefinitionToSegBuffer(bc_def, bc_defs_seg)); + if (bc_def) { + PetscInt num_essential_comps = 16, essential_comps[16]; + + PetscCall(PetscOptionsIntArray("-wall_comps", 
"An array of constrained component numbers", NULL, essential_comps, &num_essential_comps, &flg)); + PetscCall(BCDefinitionSetEssential(bc_def, num_essential_comps, essential_comps)); + + app_ctx->wall_forces.num_wall = bc_def->num_label_values; + PetscCall(PetscMalloc1(bc_def->num_label_values, &app_ctx->wall_forces.walls)); + PetscCall(PetscArraycpy(app_ctx->wall_forces.walls, bc_def->label_values, bc_def->num_label_values)); + } + + { // Symmetry Boundary Conditions + const char *deprecated[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"}; + const char *flags[3] = {"-bc_symmetry_x", "-bc_symmetry_y", "-bc_symmetry_z"}; + + for (PetscInt j = 0; j < 3; j++) { + PetscCall(PetscOptionsDeprecated(deprecated[j], flags[j], "libCEED 0.12.0", + "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant " + "slip/no-penatration boundary conditions")); + PetscCall(PetscOptionsBCDefinition(flags[j], "Face IDs to apply symmetry BC", NULL, "symmetry", &bc_def, NULL)); + if (!bc_def) { + PetscCall(PetscOptionsBCDefinition(deprecated[j], "Face IDs to apply symmetry BC", NULL, "symmetry", &bc_def, NULL)); + } + PetscCall(AddBCDefinitionToSegBuffer(bc_def, bc_defs_seg)); + if (bc_def) { + PetscInt essential_comps[1] = {j + 1}; + + PetscCall(BCDefinitionSetEssential(bc_def, 1, essential_comps)); + } + } + } + + // Inflow BCs + bc->num_inflow = 16; + PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL)); + // Outflow BCs + bc->num_outflow = 16; + PetscCall(PetscOptionsIntArray("-bc_outflow", "Face IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL)); + // Freestream BCs + bc->num_freestream = 16; + PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL)); + + bc->num_slip = 16; + PetscCall(PetscOptionsIntArray("-bc_slip", "Face IDs to apply slip BC", NULL, bc->slips, 
&bc->num_slip, NULL)); + + PetscOptionsEnd(); + + PetscCall(PetscSegBufferGetSize(bc_defs_seg, &problem->num_bc_defs)); + PetscCall(PetscSegBufferExtractAlloc(bc_defs_seg, &problem->bc_defs)); + PetscCall(PetscSegBufferDestroy(&bc_defs_seg)); + + //TODO: Verify that the BCDefinition don't have overlapping claims to boundary faces + + PetscFunctionReturn(PETSC_SUCCESS); +} diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c index 1fa601231a..905144216c 100644 --- a/examples/fluids/src/cloptions.c +++ b/examples/fluids/src/cloptions.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -134,60 +134,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC strncpy(app_ctx->problem_name, problem_name, 16); } - // Wall Boundary Conditions - bc->num_wall = 16; - PetscBool flg; - PetscCall(PetscOptionsIntArray("-bc_wall", "Face IDs to apply wall BC", NULL, bc->walls, &bc->num_wall, NULL)); - bc->num_comps = 5; - PetscCall(PetscOptionsIntArray("-wall_comps", "An array of constrained component numbers", NULL, bc->wall_comps, &bc->num_comps, &flg)); - - { // Symmetry Boundary Conditions - const char *deprecated[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"}; - const char *flags[3] = {"-bc_symmetry_x", "-bc_symmetry_y", "-bc_symmetry_z"}; - PetscBool flg, has_symmetry = PETSC_FALSE; - - for (PetscInt j = 0; j < 3; j++) { - bc->num_symmetry[j] = 16; - PetscCall(PetscOptionsDeprecated(deprecated[j], flags[j], "libCEED 0.12.0", - "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant " - "slip/no-penatration boundary conditions")); - PetscCall(PetscOptionsIntArray(flags[j], "Face IDs to 
apply symmetry BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg)); - if (!flg) { - bc->num_symmetry[j] = 16; - PetscCall(PetscOptionsIntArray(deprecated[j], "Face IDs to apply slip BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg)); - } - if (bc->num_symmetry[j] > 0) has_symmetry = PETSC_TRUE; - } - - // Error if wall and symmetry BCs are set on the same face - if (has_symmetry) { - for (PetscInt c = 0; c < 3; c++) { - for (PetscInt s = 0; s < bc->num_symmetry[c]; s++) { - for (PetscInt w = 0; w < bc->num_wall; w++) { - PetscCheck(bc->symmetries[c][s] != bc->walls[w], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, - "Boundary condition already set on face %" PetscInt_FMT "!\n", bc->walls[w]); - } - } - } - } - } - app_ctx->wall_forces.num_wall = bc->num_wall; - PetscCall(PetscMalloc1(bc->num_wall, &app_ctx->wall_forces.walls)); - PetscCall(PetscArraycpy(app_ctx->wall_forces.walls, bc->walls, bc->num_wall)); - - // Inflow BCs - bc->num_inflow = 16; - PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL)); - // Outflow BCs - bc->num_outflow = 16; - PetscCall(PetscOptionsIntArray("-bc_outflow", "Face IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL)); - // Freestream BCs - bc->num_freestream = 16; - PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL)); - - bc->num_slip = 16; - PetscCall(PetscOptionsIntArray("-bc_slip", "Face IDs to apply slip BC", NULL, bc->slips, &bc->num_slip, NULL)); - // Statistics Options app_ctx->turb_spanstats_collect_interval = 1; PetscCall(PetscOptionsInt("-ts_monitor_turbulence_spanstats_collect_interval", "Number of timesteps between statistics collection", NULL, @@ -203,11 +149,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC PetscCall(PetscOptionsViewer("-ts_monitor_wall_force", "Viewer for force on each (no-slip) wall", NULL, 
&app_ctx->wall_forces.viewer, &app_ctx->wall_forces.viewer_format, NULL)); - // SGS Model Options - app_ctx->sgs_model_type = SGS_MODEL_NONE; - PetscCall(PetscOptionsEnum("-sgs_model_type", "Subgrid Stress Model type", NULL, SGSModelTypes, (PetscEnum)app_ctx->sgs_model_type, - (PetscEnum *)&app_ctx->sgs_model_type, NULL)); - PetscCall(PetscOptionsBool("-diff_filter_monitor", "Enable differential filtering TSMonitor", NULL, app_ctx->diff_filter_monitor, &app_ctx->diff_filter_monitor, NULL)); @@ -216,9 +157,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC PetscCall(PetscOptionsEnum("-mesh_transform", "Mesh transform to perform", NULL, MeshTransformTypes, (PetscEnum)app_ctx->mesh_transform_type, (PetscEnum *)&app_ctx->mesh_transform_type, NULL)); - PetscCall( - PetscOptionsBool("-sgs_train_enable", "Enable Data-Driven SGS training", NULL, app_ctx->sgs_train_enable, &app_ctx->sgs_train_enable, NULL)); - PetscOptionsEnd(); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c index 414c7154f8..c3f1478867 100644 --- a/examples/fluids/src/differential_filter.c +++ b/examples/fluids/src/differential_filter.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,6 +8,7 @@ /// Functions for setting up and performing differential filtering #include "../qfunctions//differential_filter.h" +#include #include @@ -36,8 +37,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData case STATEVAR_CONSERVATIVE: PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, DifferentialFilter_RHS_Conserv, DifferentialFilter_RHS_Conserv_loc, &qf_rhs)); break; - default: - SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Differential filtering not available for chosen state variable"); + case STATEVAR_ENTROPY: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, DifferentialFilter_RHS_Entropy, DifferentialFilter_RHS_Entropy_loc, &qf_rhs)); + break; } if (diff_filter->do_mms_test) { PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs)); @@ -69,6 +71,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData PetscCall(PetscSNPrintf(field_name, PETSC_MAX_PATH_LEN, "v%" PetscInt_FMT, dm_field)); PetscCallCeed(ceed, CeedOperatorSetField(op_rhs, field_name, elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE)); + + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter)); } PetscCall(OperatorApplyContextCreate(user->dm, dm_filter, ceed, op_rhs, NULL, NULL, user->Q_loc, NULL, &diff_filter->op_rhs_ctx)); @@ -91,7 +96,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData // -- Get Grid anisotropy tensor PetscCall(GridAnisotropyTensorCalculateCollocatedVector(ceed, user, ceed_data, &elem_restr_grid_aniso, &grid_aniso_ceed, &num_comp_grid_aniso)); - PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_lhs)); + PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_lhs)); for (PetscInt i = 0; i < diff_filter->num_filtered_fields; i++) { CeedQFunction qf_lhs; PetscInt num_comp_filter = diff_filter->num_field_components[i]; @@ -132,8 
+137,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData char field_name[PETSC_MAX_PATH_LEN]; PetscCall(PetscSNPrintf(field_name, PETSC_MAX_PATH_LEN, "v%" PetscInt_FMT, i)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(diff_filter->op_rhs_ctx->op, field_name, &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filter)); - PetscCallCeed(ceed, CeedOperatorFieldGetBasis(op_field, &basis_filter)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(op_field, NULL, &elem_restr_filter, &basis_filter, NULL)); } PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_lhs, NULL, NULL, &op_lhs_sub)); @@ -145,12 +149,17 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE)); PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "Grad_v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_lhs, op_lhs_sub)); + PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_lhs, op_lhs_sub)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_lhs)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_lhs_sub)); } + PetscCallCeed(ceed, CeedVectorDestroy(&grid_aniso_ceed)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_grid_aniso)); + PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_lhs, "filter width scaling", &diff_filter->filter_width_scaling_label)); - PetscCall(MatCeedCreate(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs)); + PetscCall(MatCreateCeed(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs)); PetscCall(KSPCreate(PetscObjectComm((PetscObject)dm_filter), &diff_filter->ksp)); PetscCall(KSPSetOptionsPrefix(diff_filter->ksp, "diff_filter_")); @@ -264,9 +273,10 @@ PetscErrorCode 
DifferentialFilterSetup(Ceed ceed, User user, CeedData ceed_data, PetscCallCeed(ceed, CeedQFunctionContextCreate(ceed, &diff_filter_qfctx)); PetscCallCeed(ceed, CeedQFunctionContextSetData(diff_filter_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*diff_filter_ctx), diff_filter_ctx)); PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(diff_filter_qfctx, CEED_MEM_HOST, FreeContextPetsc)); - PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble( - diff_filter_qfctx, "filter width scaling", offsetof(struct DifferentialFilterContext_, width_scaling), - sizeof(diff_filter_ctx->width_scaling) / sizeof(diff_filter_ctx->width_scaling[0]), "Filter width scaling")); + PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(diff_filter_qfctx, "filter width scaling", + offsetof(struct DifferentialFilterContext_, width_scaling), + sizeof(diff_filter_ctx->width_scaling) / sizeof(diff_filter_ctx->width_scaling[0]), + "Filter width scaling")); // -- Setup Operators PetscCall(DifferentialFilterCreateOperators(ceed, user, ceed_data, diff_filter_qfctx)); diff --git a/examples/fluids/src/dm_utils.c b/examples/fluids/src/dm_utils.c index 074240fbfc..b7a1bf8ea7 100644 --- a/examples/fluids/src/dm_utils.c +++ b/examples/fluids/src/dm_utils.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -67,8 +67,8 @@ PetscErrorCode DMPlexCeedElemRestrictionCreate(Ceed ceed, DM dm, DMLabel domain_ CeedInt *restriction_offsets_ceed = NULL; PetscFunctionBeginUser; - PetscCall( - DMPlexGetLocalOffsets(dm, domain_label, label_value, height, dm_field, &num_elem, &elem_size, &num_comp, &num_dof, &restriction_offsets_petsc)); + PetscCall(DMPlexGetLocalOffsets(dm, domain_label, label_value, height, dm_field, &num_elem, &elem_size, &num_comp, &num_dof, + &restriction_offsets_petsc)); PetscCall(IntArrayPetscToCeed(num_elem * elem_size, &restriction_offsets_petsc, &restriction_offsets_ceed)); PetscCallCeed(ceed, CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, restriction_offsets_ceed, restriction)); diff --git a/examples/fluids/src/grid_anisotropy_tensor.c b/examples/fluids/src/grid_anisotropy_tensor.c index 15692ee7d6..8e5ffecb49 100644 --- a/examples/fluids/src/grid_anisotropy_tensor.c +++ b/examples/fluids/src/grid_anisotropy_tensor.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -75,7 +75,7 @@ PetscErrorCode GridAnisotropyTensorProjectionSetupApply(Ceed ceed, User user, Ce { // -- Setup KSP for L^2 projection Mat mat_mass; - PetscCall(MatCeedCreate(grid_aniso_proj->dm, grid_aniso_proj->dm, op_mass, NULL, &mat_mass)); + PetscCall(MatCreateCeed(grid_aniso_proj->dm, grid_aniso_proj->dm, op_mass, NULL, &mat_mass)); PetscCall(KSPCreate(comm, &ksp)); PetscCall(KSPSetOptionsPrefix(ksp, "grid_anisotropy_tensor_projection_")); diff --git a/examples/fluids/src/inverse_multiplicity.c b/examples/fluids/src/inverse_multiplicity.c index 2d71cc15fe..0e8cb90cbf 100644 --- a/examples/fluids/src/inverse_multiplicity.c +++ b/examples/fluids/src/inverse_multiplicity.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/fluids/src/log_events.c b/examples/fluids/src/log_events.c new file mode 100644 index 0000000000..d67b312250 --- /dev/null +++ b/examples/fluids/src/log_events.c @@ -0,0 +1,36 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include + +static PetscClassId libCEED_classid, misc_classid; + +PetscLogEvent FLUIDS_CeedOperatorApply; +PetscLogEvent FLUIDS_CeedOperatorAssemble; +PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal; +PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal; +PetscLogEvent FLUIDS_SmartRedis_Init; +PetscLogEvent FLUIDS_SmartRedis_Meta; +PetscLogEvent FLUIDS_SmartRedis_Train; +PetscLogEvent FLUIDS_TrainDataCompute; +PetscLogEvent FLUIDS_DifferentialFilter; +PetscLogEvent FLUIDS_VelocityGradientProjection; + +PetscErrorCode RegisterLogEvents() { + PetscFunctionBeginUser; + PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid)); + PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply)); + PetscCall(PetscLogEventRegister("CeedOpAsm", libCEED_classid, &FLUIDS_CeedOperatorAssemble)); + PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal)); + PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal)); + + PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid)); + PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter)); + PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection)); + PetscFunctionReturn(PETSC_SUCCESS); +} diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c index 246df8779c..5e8ebef86f 100644 --- a/examples/fluids/src/mat-ceed.c +++ b/examples/fluids/src/mat-ceed.c @@ -1,17 +1,23 @@ /// @file -/// MatCeed and it's related operators +/// MatCEED implementation -#include #include #include #include #include -#include +#include +#include +#include +#include +#include +#include #include #include PetscClassId MATCEED_CLASSID; -PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE; +PetscLogEvent 
MATCEED_MULT, MATCEED_MULT_CEEDOP, MATCEED_MULT_TRANSPOSE, MATCEED_MULT_TRANSPOSE_CEEDOP, MATCEED_ASSEMBLE_DIAGONAL, + MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, MATCEED_SETUP_PBDIAGONAL, MATCEED_SETUP_PBDIAGONAL_CEEDOP, MATCEED_ASSEMBLE_PBDIAGONAL, + MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, MATCEED_SETUP_FULL, MATCEED_SETUP_FULL_CEEDOP, MATCEED_ASSEMBLE_FULL, MATCEED_ASSEMBLE_FULL_CEEDOP; /** @brief Register MATCEED log events. @@ -21,67 +27,25 @@ PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE; @return An error code: 0 - success, otherwise - failure **/ static PetscErrorCode MatCeedRegisterLogEvents() { - static bool registered = false; + static PetscBool registered = PETSC_FALSE; PetscFunctionBeginUser; if (registered) PetscFunctionReturn(PETSC_SUCCESS); - PetscCall(PetscClassIdRegister("MATCEED", &MATCEED_CLASSID)); - PetscCall(PetscLogEventRegister("MATCEED Mult", MATCEED_CLASSID, &MATCEED_MULT)); - PetscCall(PetscLogEventRegister("MATCEED Mult Transpose", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE)); - registered = true; - PetscFunctionReturn(PETSC_SUCCESS); -} - -/** - @brief Setup inner `Mat` for `PC` operations not directly supported by libCEED. - - Collective across MPI processes. 
- - @param[in] mat_ceed `MATCEED` to setup - @param[out] mat_inner Inner `Mat` - - @return An error code: 0 - success, otherwise - failure -**/ -static PetscErrorCode MatCeedSetupInnerMat(Mat mat_ceed, Mat *mat_inner) { - MatCeedContext ctx; - - PetscFunctionBeginUser; - PetscCall(MatShellGetContext(mat_ceed, &ctx)); - - PetscCheck(ctx->dm_x == ctx->dm_y, PetscObjectComm((PetscObject)mat_ceed), PETSC_ERR_SUP, "PC only supported for MATCEED on a single DM"); - - // Check cl mat type - { - PetscBool is_internal_mat_type_cl = PETSC_FALSE; - char internal_mat_type_cl[64]; - - // Check for specific CL inner mat type for this Mat - { - const char *mat_ceed_prefix = NULL; - - PetscCall(MatGetOptionsPrefix(mat_ceed, &mat_ceed_prefix)); - PetscOptionsBegin(PetscObjectComm((PetscObject)mat_ceed), mat_ceed_prefix, "", NULL); - PetscCall(PetscOptionsFList("-ceed_inner_mat_type", "MATCEED inner assembled MatType for PC support", NULL, MatList, internal_mat_type_cl, - internal_mat_type_cl, sizeof(internal_mat_type_cl), &is_internal_mat_type_cl)); - PetscOptionsEnd(); - if (is_internal_mat_type_cl) { - PetscCall(PetscFree(ctx->internal_mat_type)); - PetscCall(PetscStrallocpy(internal_mat_type_cl, &ctx->internal_mat_type)); - } - } - } - - // Create sparse matrix - { - MatType dm_mat_type, dm_mat_type_copy; - - PetscCall(DMGetMatType(ctx->dm_x, &dm_mat_type)); - PetscCall(PetscStrallocpy(dm_mat_type, (char **)&dm_mat_type_copy)); - PetscCall(DMSetMatType(ctx->dm_x, ctx->internal_mat_type)); - PetscCall(DMCreateMatrix(ctx->dm_x, mat_inner)); - PetscCall(DMSetMatType(ctx->dm_x, dm_mat_type_copy)); - PetscCall(PetscFree(dm_mat_type_copy)); - } + PetscCall(PetscClassIdRegister("MatCEED", &MATCEED_CLASSID)); + PetscCall(PetscLogEventRegister("MatCEEDMul", MATCEED_CLASSID, &MATCEED_MULT)); + PetscCall(PetscLogEventRegister("MatCEEDMulCeed", MATCEED_CLASSID, &MATCEED_MULT_CEEDOP)); + PetscCall(PetscLogEventRegister("MatCEEDMulT", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE)); + 
PetscCall(PetscLogEventRegister("MatCEEDMulTCeed", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE_CEEDOP)); + PetscCall(PetscLogEventRegister("MatCEEDAsmDiag", MATCEED_CLASSID, &MATCEED_ASSEMBLE_DIAGONAL)); + PetscCall(PetscLogEventRegister("MatCEEDAsmPBDSU", MATCEED_CLASSID, &MATCEED_SETUP_PBDIAGONAL)); + PetscCall(PetscLogEventRegister("MatCEEDAsmPBDSUCeed", MATCEED_CLASSID, &MATCEED_SETUP_PBDIAGONAL_CEEDOP)); + PetscCall(PetscLogEventRegister("MatCEEDAsmPBD", MATCEED_CLASSID, &MATCEED_ASSEMBLE_PBDIAGONAL)); + PetscCall(PetscLogEventRegister("MatCEEDAsmPBDCeed", MATCEED_CLASSID, &MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP)); + PetscCall(PetscLogEventRegister("MatCEEDAsmSU", MATCEED_CLASSID, &MATCEED_SETUP_FULL)); + PetscCall(PetscLogEventRegister("MatCEEDAsmSUCeed", MATCEED_CLASSID, &MATCEED_SETUP_FULL_CEEDOP)); + PetscCall(PetscLogEventRegister("MatCEEDAsm", MATCEED_CLASSID, &MATCEED_ASSEMBLE_FULL)); + PetscCall(PetscLogEventRegister("MatCEEDAsmCeed", MATCEED_CLASSID, &MATCEED_ASSEMBLE_FULL_CEEDOP)); + registered = PETSC_TRUE; PetscFunctionReturn(PETSC_SUCCESS); } @@ -117,12 +81,15 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat PetscLogStage stage_amg_setup; // -- Assemble sparsity pattern if mat hasn't been assembled before - PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup)); + PetscCall(PetscLogStageGetId("MatCEED Asm Setup", &stage_amg_setup)); if (stage_amg_setup == -1) { - PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup)); + PetscCall(PetscLogStageRegister("MatCEED Asm Setup", &stage_amg_setup)); } PetscCall(PetscLogStagePush(stage_amg_setup)); + PetscCall(PetscLogEventBegin(MATCEED_SETUP_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL)); + PetscCall(PetscLogEventBegin(MATCEED_SETUP_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed)); + 
PetscCall(PetscLogEventEnd(MATCEED_SETUP_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc)); PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc)); PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc)); @@ -131,11 +98,13 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat if (!ctx->coo_values_pbd) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_pbd)); PetscCall(PetscRealloc(++ctx->num_mats_assembled_pbd * sizeof(Mat), &ctx->mats_assembled_pbd)); ctx->mats_assembled_pbd[ctx->num_mats_assembled_pbd - 1] = mat_coo; + PetscCall(PetscLogEventEnd(MATCEED_SETUP_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL)); PetscCall(PetscLogStagePop()); } } // Assemble mat_ceed + PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL)); PetscCall(MatAssemblyBegin(mat_coo, MAT_FINAL_ASSEMBLY)); { const CeedScalar *values; @@ -148,7 +117,9 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat else if (strstr(mat_type, "kokkos")) mem_type = CEED_MEM_DEVICE; else mem_type = CEED_MEM_HOST; + PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemblePointBlockDiagonal(ctx->op_mult, ctx->coo_values_pbd, CEED_REQUEST_IMMEDIATE)); + PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); PetscCallCeed(ctx->ceed, CeedVectorGetArrayRead(ctx->coo_values_pbd, mem_type, &values)); PetscCall(MatSetValuesCOO(mat_coo, values, INSERT_VALUES)); PetscCall(MatIsSPDKnown(mat_ceed, &is_spd_known, &is_spd)); @@ -156,6 +127,7 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat PetscCallCeed(ctx->ceed, CeedVectorRestoreArrayRead(ctx->coo_values_pbd, &values)); } PetscCall(MatAssemblyEnd(mat_coo, 
MAT_FINAL_ASSEMBLY)); + PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -177,14 +149,14 @@ static PetscErrorCode MatCeedAssembleInnerBlockDiagonalMat(Mat mat_ceed, PetscBo PetscCall(MatShellGetContext(mat_ceed, &ctx)); if (use_ceed_pbd) { // Check if COO pattern set - if (!ctx->mat_assembled_pbd_internal) PetscCall(MatCeedSetupInnerMat(mat_ceed, &ctx->mat_assembled_pbd_internal)); + if (!ctx->mat_assembled_pbd_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_pbd_internal)); // Assemble mat_assembled_full_internal PetscCall(MatCeedAssemblePointBlockDiagonalCOO(mat_ceed, ctx->mat_assembled_pbd_internal)); if (mat_inner) *mat_inner = ctx->mat_assembled_pbd_internal; } else { // Check if COO pattern set - if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedSetupInnerMat(mat_ceed, &ctx->mat_assembled_full_internal)); + if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_full_internal)); // Assemble mat_assembled_full_internal PetscCall(MatCeedAssembleCOO(mat_ceed, ctx->mat_assembled_full_internal)); @@ -194,79 +166,134 @@ static PetscErrorCode MatCeedAssembleInnerBlockDiagonalMat(Mat mat_ceed, PetscBo } /** - @brief Get `MATCEED` diagonal block for Jacobi. + @brief Get `MATCEED` variable block diagonal for Jacobi. Collective across MPI processes. 
@param[in] mat_ceed `MATCEED` to invert - @param[out] mat_block The diagonal block matrix + @param[out] mat_vblock The variable diagonal block matrix @return An error code: 0 - success, otherwise - failure **/ -static PetscErrorCode MatGetDiagonalBlock_Ceed(Mat mat_ceed, Mat *mat_block) { - Mat mat_inner = NULL; +static PetscErrorCode MatGetVariableBlockDiagonal_Ceed(Mat mat_ceed, Mat *mat_vblock) { MatCeedContext ctx; PetscFunctionBeginUser; PetscCall(MatShellGetContext(mat_ceed, &ctx)); // Assemble inner mat if needed - PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, &mat_inner)); - - // Get block diagonal - PetscCall(MatGetDiagonalBlock(mat_inner, mat_block)); + PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_vpbd_valid, mat_vblock)); + PetscCall(PetscObjectReference((PetscObject)*mat_vblock)); PetscFunctionReturn(PETSC_SUCCESS); } /** - @brief Invert `MATCEED` diagonal block for Jacobi. + @brief Get `MATCEED` block diagonal for Jacobi. Collective across MPI processes. 
- @param[in] mat_ceed `MATCEED` to invert - @param[out] values The block inverses in column major order + @param[in] mat_ceed `MATCEED` to get the block diagonal of + @param[out] mat_block The block diagonal matrix @return An error code: 0 - success, otherwise - failure **/ -static PetscErrorCode MatInvertBlockDiagonal_Ceed(Mat mat_ceed, const PetscScalar **values) { - Mat mat_inner = NULL; +static PetscErrorCode MatGetBlockDiagonal_Ceed(Mat mat_ceed, Mat *mat_block) { MatCeedContext ctx; PetscFunctionBeginUser; PetscCall(MatShellGetContext(mat_ceed, &ctx)); // Assemble inner mat if needed - PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, &mat_inner)); - - // Invert PB diagonal - PetscCall(MatInvertBlockDiagonal(mat_inner, values)); + PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, mat_block)); + PetscCall(PetscObjectReference((PetscObject)*mat_block)); PetscFunctionReturn(PETSC_SUCCESS); } /** - @brief Invert `MATCEED` variable diagonal block for Jacobi. + @brief Get on-process diagonal block of `MATCEED` + + This is a block per-process of the diagonal of the global matrix. + This is NOT the diagonal blocks associated with the block size of the matrix (i.e. `MatSetBlockSize()` has no effect on this function). Collective across MPI processes.
- @param[in] mat_ceed `MATCEED` to invert - @param[in] num_blocks The number of blocks on the process - @param[in] block_sizes The size of each block on the process - @param[out] values The block inverses in column major order + @param[in] mat_ceed `MATCEED` to invert + @param[out] mat_block The diagonal block matrix @return An error code: 0 - success, otherwise - failure **/ -static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt num_blocks, const PetscInt *block_sizes, PetscScalar *values) { - Mat mat_inner = NULL; +static PetscErrorCode MatGetDiagonalBlock_Ceed(Mat mat_ceed, Mat *mat_block) { MatCeedContext ctx; PetscFunctionBeginUser; PetscCall(MatShellGetContext(mat_ceed, &ctx)); - // Assemble inner mat if needed - PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_vpbd_valid, &mat_inner)); + // Check if COO pattern set + if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_full_internal)); + + // Assemble mat_assembled_full_internal + PetscCall(MatCeedAssembleCOO(mat_ceed, ctx->mat_assembled_full_internal)); + + // Get diagonal block + PetscCall(MatGetDiagonalBlock(ctx->mat_assembled_full_internal, mat_block)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief View `MATCEED`. + + Collective across MPI processes. 
+ + @param[in] mat_ceed `MATCEED` to view + @param[in] viewer The visualization context - // Invert PB diagonal - PetscCall(MatInvertVariableBlockDiagonal(mat_inner, num_blocks, block_sizes, values)); + @return An error code: 0 - success, otherwise - failure +**/ +static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) { + PetscBool is_ascii; + PetscViewerFormat format; + PetscMPIInt size, rank; + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2); + PetscCall(MatShellGetContext(mat_ceed, &ctx)); + if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)mat_ceed), &viewer)); + + PetscCall(PetscViewerGetFormat(viewer, &format)); + PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat_ceed), &size)); + if (size == 1 && format == PETSC_VIEWER_LOAD_BALANCE) PetscFunctionReturn(PETSC_SUCCESS); + + PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat_ceed), &rank)); + if (rank != 0) PetscFunctionReturn(PETSC_SUCCESS); + + PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &is_ascii)); + { + PetscBool is_detailed = format == PETSC_VIEWER_ASCII_INFO_DETAIL; + char rank_string[16] = {'\0'}; + FILE *file; + + PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n")); + PetscCall(PetscViewerASCIIPushTab(viewer)); // MatCEED + PetscCall(PetscViewerASCIIPrintf(viewer, "Default COO MatType: %s\n", ctx->coo_mat_type)); + PetscCall(PetscSNPrintf(rank_string, 16, "on Rank %d", rank)); + PetscCall(PetscViewerASCIIPrintf(viewer, "CeedOperator Apply %s:\n", is_detailed ? rank_string : "Summary")); + PetscCall(PetscViewerASCIIPrintf(viewer, "libCEED PB Diagonal Assembly: %s\n", ctx->is_ceed_pbd_valid ? "True" : "False")); + PetscCall(PetscViewerASCIIPrintf(viewer, "libCEED VPB Diagonal Assembly: %s\n", ctx->is_ceed_vpbd_valid ? 
"True" : "False")); + PetscCall(PetscViewerASCIIGetPointer(viewer, &file)); + PetscCall(PetscViewerASCIIPushTab(viewer)); // CeedOperator + if (is_detailed) PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult, file)); + else PetscCallCeed(ctx->ceed, CeedOperatorViewTerse(ctx->op_mult, file)); + PetscCall(PetscViewerASCIIPopTab(viewer)); // CeedOperator + if (ctx->op_mult_transpose) { + PetscCall(PetscViewerASCIIPrintf(viewer, "CeedOperator ApplyTranspose %s:\n", is_detailed ? rank_string : "Summary")); + PetscCall(PetscViewerASCIIPushTab(viewer)); // CeedOperator + if (is_detailed) PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult_transpose, file)); + else PetscCallCeed(ctx->ceed, CeedOperatorViewTerse(ctx->op_mult_transpose, file)); + PetscCall(PetscViewerASCIIPopTab(viewer)); // CeedOperator + } + PetscCall(PetscViewerASCIIPopTab(viewer)); // MatCEED + } PetscFunctionReturn(PETSC_SUCCESS); } @@ -287,7 +314,7 @@ static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt @return An error code: 0 - success, otherwise - failure **/ -PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat) { +PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat) { PetscInt X_l_size, X_g_size, Y_l_size, Y_g_size; VecType vec_type; MatCeedContext ctx; @@ -317,6 +344,7 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato Y_g_size = X_g_size; Y_l_size = X_l_size; } + // Create context { Vec X_loc, Y_loc_transpose = NULL; @@ -327,7 +355,8 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato PetscCall(DMCreateLocalVector(dm_y, &Y_loc_transpose)); PetscCall(VecZeroEntries(Y_loc_transpose)); } - PetscCall(MatCeedContextCreate(dm_x, dm_y, X_loc, Y_loc_transpose, op_mult, op_mult_transpose, MATCEED_MULT, MATCEED_MULT_TRANSPOSE, &ctx)); + PetscCall(MatCeedContextCreate(dm_x, dm_y, X_loc, 
Y_loc_transpose, op_mult, op_mult_transpose, MATCEED_MULT, MATCEED_MULT_TRANSPOSE, + MATCEED_MULT_CEEDOP, MATCEED_MULT_TRANSPOSE_CEEDOP, &ctx)); PetscCall(VecDestroy(&X_loc)); PetscCall(VecDestroy(&Y_loc_transpose)); } @@ -377,8 +406,8 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato CeedInt num_sub_operators; CeedOperator *sub_operators; - PetscCallCeed(ctx->ceed, CeedCompositeOperatorGetNumSub(op_mult, &num_sub_operators)); - PetscCallCeed(ctx->ceed, CeedCompositeOperatorGetSubList(op_mult, &sub_operators)); + PetscCallCeed(ctx->ceed, CeedOperatorCompositeGetNumSub(op_mult, &num_sub_operators)); + PetscCallCeed(ctx->ceed, CeedOperatorCompositeGetSubList(op_mult, &sub_operators)); for (CeedInt i = 0; i < num_sub_operators; i++) { CeedInt num_bases, num_comp; CeedBasis *active_bases; @@ -428,22 +457,23 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato // -- Set internal mat type { VecType vec_type; - MatType internal_mat_type = MATAIJ; + MatType coo_mat_type; PetscCall(VecGetType(ctx->X_loc, &vec_type)); - if (strstr(vec_type, VECCUDA)) internal_mat_type = MATAIJCUSPARSE; - else if (strstr(vec_type, VECKOKKOS)) internal_mat_type = MATAIJKOKKOS; - else internal_mat_type = MATAIJ; - PetscCall(PetscStrallocpy(internal_mat_type, &ctx->internal_mat_type)); + if (strstr(vec_type, VECCUDA)) coo_mat_type = MATAIJCUSPARSE; + else if (strstr(vec_type, VECKOKKOS)) coo_mat_type = MATAIJKOKKOS; + else coo_mat_type = MATAIJ; + PetscCall(PetscStrallocpy(coo_mat_type, &ctx->coo_mat_type)); } // -- Set mat operations - PetscCall(MatShellSetContextDestroy(*mat, (PetscErrorCode(*)(void *))MatCeedContextDestroy)); + PetscCall(MatShellSetContextDestroy(*mat, (PetscCtxDestroyFn *)MatCeedContextDestroy)); + PetscCall(MatShellSetOperation(*mat, MATOP_VIEW, (void (*)(void))MatView_Ceed)); PetscCall(MatShellSetOperation(*mat, MATOP_MULT, (void (*)(void))MatMult_Ceed)); if (op_mult_transpose) 
PetscCall(MatShellSetOperation(*mat, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed)); PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed)); PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL_BLOCK, (void (*)(void))MatGetDiagonalBlock_Ceed)); - PetscCall(MatShellSetOperation(*mat, MATOP_INVERT_BLOCK_DIAGONAL, (void (*)(void))MatInvertBlockDiagonal_Ceed)); - PetscCall(MatShellSetOperation(*mat, MATOP_INVERT_VBLOCK_DIAGONAL, (void (*)(void))MatInvertVariableBlockDiagonal_Ceed)); + PetscCall(MatShellSetOperation(*mat, MATOP_GET_BLOCK_DIAGONAL, (void (*)(void))MatGetBlockDiagonal_Ceed)); + PetscCall(MatShellSetOperation(*mat, MATOP_GET_VBLOCK_DIAGONAL, (void (*)(void))MatGetVariableBlockDiagonal_Ceed)); PetscCall(MatShellSetVecType(*mat, vec_type)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -464,13 +494,16 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) { // Check type compatibility { - MatType mat_type_ceed, mat_type_other; + PetscBool is_matceed = PETSC_FALSE, is_matshell = PETSC_FALSE; + MatType mat_type_ceed, mat_type_other; PetscCall(MatGetType(mat_ceed, &mat_type_ceed)); - PetscCheck(!strcmp(mat_type_ceed, MATCEED), PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_ceed must have type " MATCEED); - PetscCall(MatGetType(mat_ceed, &mat_type_other)); - PetscCheck(!strcmp(mat_type_other, MATCEED) || !strcmp(mat_type_other, MATSHELL), PETSC_COMM_SELF, PETSC_ERR_LIB, - "mat_other must have type " MATCEED " or " MATSHELL); + PetscCall(PetscStrcmp(mat_type_ceed, MATCEED, &is_matceed)); + PetscCheck(is_matceed, PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_ceed must have type " MATCEED); + PetscCall(MatGetType(mat_other, &mat_type_other)); + PetscCall(PetscStrcmp(mat_type_other, MATCEED, &is_matceed)); + PetscCall(PetscStrcmp(mat_type_other, MATSHELL, &is_matshell)); + PetscCheck(is_matceed || is_matshell, PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_other must have type " MATCEED " or " MATSHELL); } // Check dimension compatibility
@@ -499,13 +532,14 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) { PetscCall(MatShellGetContext(mat_ceed, &ctx)); PetscCall(MatCeedContextReference(ctx)); PetscCall(MatShellSetContext(mat_other, ctx)); - PetscCall(MatShellSetContextDestroy(mat_other, (PetscErrorCode(*)(void *))MatCeedContextDestroy)); + PetscCall(MatShellSetContextDestroy(mat_other, (PetscCtxDestroyFn *)MatCeedContextDestroy)); + PetscCall(MatShellSetOperation(mat_other, MATOP_VIEW, (void (*)(void))MatView_Ceed)); PetscCall(MatShellSetOperation(mat_other, MATOP_MULT, (void (*)(void))MatMult_Ceed)); if (ctx->op_mult_transpose) PetscCall(MatShellSetOperation(mat_other, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed)); PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed)); PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL_BLOCK, (void (*)(void))MatGetDiagonalBlock_Ceed)); - PetscCall(MatShellSetOperation(mat_other, MATOP_INVERT_BLOCK_DIAGONAL, (void (*)(void))MatInvertBlockDiagonal_Ceed)); - PetscCall(MatShellSetOperation(mat_other, MATOP_INVERT_VBLOCK_DIAGONAL, (void (*)(void))MatInvertVariableBlockDiagonal_Ceed)); + PetscCall(MatShellSetOperation(mat_other, MATOP_GET_BLOCK_DIAGONAL, (void (*)(void))MatGetBlockDiagonal_Ceed)); + PetscCall(MatShellSetOperation(mat_other, MATOP_GET_VBLOCK_DIAGONAL, (void (*)(void))MatGetVariableBlockDiagonal_Ceed)); { PetscInt block_size; @@ -525,6 +559,132 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) { PetscFunctionReturn(PETSC_SUCCESS); } +/** + @brief Mark `CeedQFunction` data as updated and the `CeedQFunction` as requiring re-assembly for a `MatCEED`. + + Collective across MPI processes. 
+ + @param[in] mat_ceed `MATCEED` + @param[in] update_needed Boolean flag indicating `CeedQFunction` update needed + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetAssemblyDataUpdateNeeded(Mat mat_ceed, PetscBool update_needed) { + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat_ceed, &ctx)); + PetscCallCeed(ctx->ceed, CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(ctx->op_mult, update_needed)); + if (ctx->op_mult_transpose) { + PetscCallCeed(ctx->ceed, CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(ctx->op_mult_transpose, update_needed)); + } + if (update_needed) { + PetscCall(MatAssemblyBegin(mat_ceed, MAT_FINAL_ASSEMBLY)); + PetscCall(MatAssemblyEnd(mat_ceed, MAT_FINAL_ASSEMBLY)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Setup a `Mat` with the same COO pattern as a `MatCEED`. + + Collective across MPI processes. + + @param[in] mat_ceed `MATCEED` + @param[out] mat_coo Sparse `Mat` with same COO pattern + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo) { + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat_ceed, &ctx)); + + PetscCheck(ctx->dm_x == ctx->dm_y, PetscObjectComm((PetscObject)mat_ceed), PETSC_ERR_SUP, "COO assembly only supported for MATCEED on a single DM"); + + // Check cl mat type + { + PetscBool is_coo_mat_type_cl = PETSC_FALSE; + char coo_mat_type_cl[64]; + + // Check for specific CL coo mat type for this Mat + { + const char *mat_ceed_prefix = NULL; + + PetscCall(MatGetOptionsPrefix(mat_ceed, &mat_ceed_prefix)); + PetscOptionsBegin(PetscObjectComm((PetscObject)mat_ceed), mat_ceed_prefix, "", NULL); + PetscCall(PetscOptionsFList("-ceed_coo_mat_type", "Default MATCEED COO assembly MatType", NULL, MatList, coo_mat_type_cl, coo_mat_type_cl, + sizeof(coo_mat_type_cl), &is_coo_mat_type_cl)); + PetscOptionsEnd(); + if (is_coo_mat_type_cl) { +
PetscCall(PetscFree(ctx->coo_mat_type)); + PetscCall(PetscStrallocpy(coo_mat_type_cl, &ctx->coo_mat_type)); + } + } + } + + // Create sparse matrix + { + MatType dm_mat_type, dm_mat_type_copy; + + PetscCall(DMGetMatType(ctx->dm_x, &dm_mat_type)); + PetscCall(PetscStrallocpy(dm_mat_type, (char **)&dm_mat_type_copy)); + PetscCall(DMSetMatType(ctx->dm_x, ctx->coo_mat_type)); + PetscCall(DMCreateMatrix(ctx->dm_x, mat_coo)); + PetscCall(DMSetMatType(ctx->dm_x, dm_mat_type_copy)); + PetscCall(PetscFree(dm_mat_type_copy)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Setup the COO preallocation `MATCEED` into a `MATAIJ` or similar. + The caller is responsible for assuring the global and local sizes are compatible, otherwise this function will fail. + + Collective across MPI processes. + + @param[in] mat_ceed `MATCEED` to assemble + @param[in,out] mat_coo `MATAIJ` or similar to assemble into + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo) { + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat_ceed, &ctx)); + + { + PetscInt *rows_petsc = NULL, *cols_petsc = NULL; + CeedInt *rows_ceed, *cols_ceed; + PetscCount num_entries; + PetscLogStage stage_amg_setup; + + // -- Assemble sparsity pattern if mat hasn't been assembled before + PetscCall(PetscLogStageGetId("MatCEED Asm Setup", &stage_amg_setup)); + if (stage_amg_setup == -1) { + PetscCall(PetscLogStageRegister("MatCEED Asm Setup", &stage_amg_setup)); + } + PetscCall(PetscLogStagePush(stage_amg_setup)); + PetscCall(PetscLogEventBegin(MATCEED_SETUP_FULL, mat_ceed, mat_coo, NULL, NULL)); + PetscCall(PetscLogEventBegin(MATCEED_SETUP_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); + PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed)); + PetscCall(PetscLogEventEnd(MATCEED_SETUP_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); + 
PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc)); + PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc)); + PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc)); + free(rows_petsc); + free(cols_petsc); + if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full)); + PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full)); + ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo; + PetscCall(PetscLogEventEnd(MATCEED_SETUP_FULL, mat_ceed, mat_coo, NULL, NULL)); + PetscCall(PetscLogStagePop()); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + /** @brief Assemble a `MATCEED` into a `MATAIJ` or similar. The `mat_coo` preallocation is set to match the sparsity pattern of `mat_ceed`. @@ -543,39 +703,18 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) { PetscFunctionBeginUser; PetscCall(MatShellGetContext(mat_ceed, &ctx)); - // Check if COO pattern set + // Set COO pattern if needed { - PetscInt index = -1; + CeedInt index = -1; for (PetscInt i = 0; i < ctx->num_mats_assembled_full; i++) { if (ctx->mats_assembled_full[i] == mat_coo) index = i; } - if (index == -1) { - PetscInt *rows_petsc = NULL, *cols_petsc = NULL; - CeedInt *rows_ceed, *cols_ceed; - PetscCount num_entries; - PetscLogStage stage_amg_setup; - - // -- Assemble sparsity pattern if mat hasn't been assembled before - PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup)); - if (stage_amg_setup == -1) { - PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup)); - } - PetscCall(PetscLogStagePush(stage_amg_setup)); - PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed)); - PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc)); - PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc)); - 
PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc)); - free(rows_petsc); - free(cols_petsc); - if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full)); - PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full)); - ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo; - PetscCall(PetscLogStagePop()); - } + if (index == -1) PetscCall(MatCeedSetPreallocationCOO(mat_ceed, mat_coo)); } // Assemble mat_ceed + PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_FULL, mat_ceed, mat_coo, NULL, NULL)); PetscCall(MatAssemblyBegin(mat_coo, MAT_FINAL_ASSEMBLY)); { const CeedScalar *values; @@ -588,7 +727,9 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) { else if (strstr(mat_type, "kokkos")) mem_type = CEED_MEM_DEVICE; else mem_type = CEED_MEM_HOST; + PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemble(ctx->op_mult, ctx->coo_values_full)); + PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL)); PetscCallCeed(ctx->ceed, CeedVectorGetArrayRead(ctx->coo_values_full, mem_type, &values)); PetscCall(MatSetValuesCOO(mat_coo, values, INSERT_VALUES)); PetscCall(MatIsSPDKnown(mat_ceed, &is_spd_known, &is_spd)); @@ -596,6 +737,222 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) { PetscCallCeed(ctx->ceed, CeedVectorRestoreArrayRead(ctx->coo_values_full, &values)); } PetscCall(MatAssemblyEnd(mat_coo, MAT_FINAL_ASSEMBLY)); + PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_FULL, mat_ceed, mat_coo, NULL, NULL)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Set the current value of a context field for a `MatCEED`. + + Not collective across MPI processes. 
+ + @param[in,out] mat `MatCEED` + @param[in] name Name of the context field + @param[in] value New context field value + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value) { + PetscBool was_updated = PETSC_FALSE; + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat, &ctx)); + { + CeedContextFieldLabel label = NULL; + + PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult, name, &label)); + if (label) { + double set_value = 2 * value + 1.0; + + PetscCall(MatCeedGetContextDouble(mat, name, &set_value)); + if (set_value != value) { + PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult, label, &value)); + was_updated = PETSC_TRUE; + } + } + if (ctx->op_mult_transpose) { + label = NULL; + PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult_transpose, name, &label)); + if (label) { + double set_value = 2 * value + 1.0; + + PetscCall(MatCeedGetContextDouble(mat, name, &set_value)); + if (set_value != value) { + PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult_transpose, label, &value)); + was_updated = PETSC_TRUE; + } + } + } + } + if (was_updated) { + PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY)); + PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Get the current value of a context field for a `MatCEED`. + + Not collective across MPI processes. 
+ + @param[in] mat `MatCEED` + @param[in] name Name of the context field + @param[out] value Current context field value + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value) { + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat, &ctx)); + { + CeedContextFieldLabel label = NULL; + CeedOperator op = ctx->op_mult; + + PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(op, name, &label)); + if (!label && ctx->op_mult_transpose) { + op = ctx->op_mult_transpose; + PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(op, name, &label)); + } + if (label) { + PetscSizeT num_values; + const double *values_ceed; + + PetscCallCeed(ctx->ceed, CeedOperatorGetContextDoubleRead(op, label, &num_values, &values_ceed)); + *value = values_ceed[0]; + PetscCallCeed(ctx->ceed, CeedOperatorRestoreContextDoubleRead(op, label, &values_ceed)); + } + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Set the current `PetscReal` value of a context field for a `MatCEED`. + + Not collective across MPI processes. + + @param[in,out] mat `MatCEED` + @param[in] name Name of the context field + @param[in] value New context field value + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value) { + double value_double = value; + + PetscFunctionBeginUser; + PetscCall(MatCeedSetContextDouble(mat, name, value_double)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Get the current `PetscReal` value of a context field for a `MatCEED`. + + Not collective across MPI processes. 
+ + @param[in] mat `MatCEED` + @param[in] name Name of the context field + @param[out] value Current context field value + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value) { + double value_double = 0.0; + + PetscFunctionBeginUser; + PetscCall(MatCeedGetContextDouble(mat, name, &value_double)); + *value = value_double; + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Set the current time for a `MatCEED`. + + Not collective across MPI processes. + + @param[in,out] mat `MatCEED` + @param[in] time Current time + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetTime(Mat mat, PetscReal time) { + PetscFunctionBeginUser; + { + double time_ceed = time; + + PetscCall(MatCeedSetContextDouble(mat, "time", time_ceed)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Get the current time for a `MatCEED`. + + Not collective across MPI processes. + + @param[in] mat `MatCEED` + @param[out] time Current time, or -1.0 if neither `CeedOperator` of the `MatCEED` has a "time" context field + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedGetTime(Mat mat, PetscReal *time) { + PetscFunctionBeginUser; + *time = -1.0; + { + double time_ceed = -1.0; + + PetscCall(MatCeedGetContextDouble(mat, "time", &time_ceed)); + *time = time_ceed; + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Set the current time step for a `MatCEED`. + + Not collective across MPI processes. + + @param[in,out] mat `MatCEED` + @param[in] dt Current time step + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetDt(Mat mat, PetscReal dt) { + PetscFunctionBeginUser; + { + double dt_ceed = dt; + + PetscCall(MatCeedSetContextDouble(mat, "dt", dt_ceed)); + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Set the Jacobian shifts for a `MatCEED`. + + Not collective across MPI processes.
+ + @param[in,out] mat `MatCEED` + @param[in] shift_v Velocity shift + @param[in] shift_a Acceleration shift + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetShifts(Mat mat, PetscReal shift_v, PetscReal shift_a) { + PetscFunctionBeginUser; + { + double shift_v_ceed = shift_v; + + PetscCall(MatCeedSetContextDouble(mat, "shift v", shift_v_ceed)); + } + if (shift_a) { + double shift_a_ceed = shift_a; + + PetscCall(MatCeedSetContextDouble(mat, "shift a", shift_a_ceed)); + } PetscFunctionReturn(PETSC_SUCCESS); } @@ -610,14 +967,14 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) { @return An error code: 0 - success, otherwise - failure **/ -PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx) { +PetscErrorCode MatCeedSetContext(Mat mat, PetscCtxDestroyFn f, void *ctx) { PetscContainer user_ctx = NULL; PetscFunctionBeginUser; if (ctx) { PetscCall(PetscContainerCreate(PetscObjectComm((PetscObject)mat), &user_ctx)); PetscCall(PetscContainerSetPointer(user_ctx, ctx)); - PetscCall(PetscContainerSetUserDestroy(user_ctx, f)); + PetscCall(PetscContainerSetCtxDestroy(user_ctx, f)); } PetscCall(PetscObjectCompose((PetscObject)mat, "MatCeed user context", (PetscObject)user_ctx)); PetscCall(PetscContainerDestroy(&user_ctx)); @@ -643,18 +1000,37 @@ PetscErrorCode MatCeedGetContext(Mat mat, void *ctx) { else *(void **)ctx = NULL; PetscFunctionReturn(PETSC_SUCCESS); } +/** + @brief Set a user defined matrix operation for a `MATCEED` matrix. + + Within each user-defined routine, the user should call `MatCeedGetContext()` to obtain the user-defined context that was set by +`MatCeedSetContext()`. + + Collective across MPI processes. 
+ + @param[in,out] mat `MATCEED` + @param[in] op Name of the `MatOperation` + @param[in] g Function that provides the operation + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)) { + PetscFunctionBeginUser; + PetscCall(MatShellSetOperation(mat, op, g)); + PetscFunctionReturn(PETSC_SUCCESS); +} /** - @brief Sets the inner matrix type as a string from the `MATCEED`. + @brief Sets the default COO matrix type as a string from the `MATCEED`. Collective across MPI processes. @param[in,out] mat `MATCEED` - @param[in] type Inner `MatType` to set + @param[in] type COO `MatType` to set @return An error code: 0 - success, otherwise - failure **/ -PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) { +PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type) { MatCeedContext ctx; PetscFunctionBeginUser; @@ -664,9 +1040,9 @@ PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) { size_t len_old, len_new; PetscBool is_same = PETSC_FALSE; - PetscCall(PetscStrlen(ctx->internal_mat_type, &len_old)); + PetscCall(PetscStrlen(ctx->coo_mat_type, &len_old)); PetscCall(PetscStrlen(type, &len_new)); - if (len_old == len_new) PetscCall(PetscStrncmp(ctx->internal_mat_type, type, len_old, &is_same)); + if (len_old == len_new) PetscCall(PetscStrncmp(ctx->coo_mat_type, type, len_old, &is_same)); if (is_same) PetscFunctionReturn(PETSC_SUCCESS); } // Clean up old mats in different format @@ -695,48 +1071,28 @@ PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) { } } } - PetscCall(PetscFree(ctx->internal_mat_type)); - PetscCall(PetscStrallocpy(type, &ctx->internal_mat_type)); + PetscCall(PetscFree(ctx->coo_mat_type)); + PetscCall(PetscStrallocpy(type, &ctx->coo_mat_type)); PetscFunctionReturn(PETSC_SUCCESS); // LCOV_EXCL_STOP } /** - @brief Gets the inner matrix type as a string from the `MATCEED`. + @brief Gets the default COO matrix type as a string from the `MATCEED`. 
Collective across MPI processes. @param[in,out] mat `MATCEED` - @param[in] type Inner `MatType` + @param[in] type COO `MatType` @return An error code: 0 - success, otherwise - failure **/ -PetscErrorCode MatCeedGetInnerMatType(Mat mat, MatType *type) { +PetscErrorCode MatCeedGetCOOMatType(Mat mat, MatType *type) { MatCeedContext ctx; PetscFunctionBeginUser; PetscCall(MatShellGetContext(mat, &ctx)); - *type = ctx->internal_mat_type; - PetscFunctionReturn(PETSC_SUCCESS); -} - -/** - @brief Set a user defined matrix operation for a `MATCEED` matrix. - - Within each user-defined routine, the user should call `MatCeedGetContext()` to obtain the user-defined context that was set by -`MatCeedSetContext()`. - - Collective across MPI processes. - - @param[in,out] mat `MATCEED` - @param[in] op Name of the `MatOperation` - @param[in] g Function that provides the operation - - @return An error code: 0 - success, otherwise - failure -**/ -PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)) { - PetscFunctionBeginUser; - PetscCall(MatShellSetOperation(mat, op, g)); + *type = ctx->coo_mat_type; PetscFunctionReturn(PETSC_SUCCESS); } @@ -763,9 +1119,7 @@ PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose) { PetscCall(VecGetSize(X_loc, &len_new)); PetscCheck(len_old == len_new, PETSC_COMM_SELF, PETSC_ERR_LIB, "new X_loc length %" PetscInt_FMT " should match old X_loc length %" PetscInt_FMT, len_new, len_old); - PetscCall(VecDestroy(&ctx->X_loc)); - ctx->X_loc = X_loc; - PetscCall(PetscObjectReference((PetscObject)X_loc)); + PetscCall(VecReferenceCopy(X_loc, &ctx->X_loc)); } if (Y_loc_transpose) { PetscInt len_old, len_new; @@ -774,9 +1128,7 @@ PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose) { PetscCall(VecGetSize(Y_loc_transpose, &len_new)); PetscCheck(len_old == len_new, PETSC_COMM_SELF, PETSC_ERR_LIB, "new Y_loc_transpose length %" PetscInt_FMT " should match old Y_loc_transpose length %" 
PetscInt_FMT, len_new, len_old); - PetscCall(VecDestroy(&ctx->Y_loc_transpose)); - ctx->Y_loc_transpose = Y_loc_transpose; - PetscCall(PetscObjectReference((PetscObject)Y_loc_transpose)); + PetscCall(VecReferenceCopy(Y_loc_transpose, &ctx->Y_loc_transpose)); } PetscFunctionReturn(PETSC_SUCCESS); } @@ -798,12 +1150,12 @@ PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose) PetscFunctionBeginUser; PetscCall(MatShellGetContext(mat, &ctx)); if (X_loc) { - *X_loc = ctx->X_loc; - PetscCall(PetscObjectReference((PetscObject)*X_loc)); + *X_loc = NULL; + PetscCall(VecReferenceCopy(ctx->X_loc, X_loc)); } if (Y_loc_transpose) { - *Y_loc_transpose = ctx->Y_loc_transpose; - PetscCall(PetscObjectReference((PetscObject)*Y_loc_transpose)); + *Y_loc_transpose = NULL; + PetscCall(VecReferenceCopy(ctx->Y_loc_transpose, Y_loc_transpose)); } PetscFunctionReturn(PETSC_SUCCESS); } @@ -916,6 +1268,48 @@ PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, Petsc PetscFunctionReturn(PETSC_SUCCESS); } +/** + @brief Set `CeedOperator` `PetscLogEvent` for `MATCEED` `MatMult()` and `MatMultTranspose()` operators. + + Not collective across MPI processes. 
+ + @param[in,out] mat MatCeed + @param[in] log_event_mult `PetscLogEvent` for forward `CeedOperator` evaluation, or 0 to keep the current event + @param[in] log_event_mult_transpose `PetscLogEvent` for transpose `CeedOperator` evaluation, or 0 to keep the current event + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedSetCeedOperatorLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose) { + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat, &ctx)); + if (log_event_mult) ctx->log_event_ceed_mult = log_event_mult; + if (log_event_mult_transpose) ctx->log_event_ceed_mult_transpose = log_event_mult_transpose; + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + @brief Get `CeedOperator` `PetscLogEvent` for `MATCEED` `MatMult()` and `MatMultTranspose()` operators. + + Not collective across MPI processes. + + @param[in,out] mat MatCeed + @param[out] log_event_mult `PetscLogEvent` for forward `CeedOperator` evaluation, or NULL + @param[out] log_event_mult_transpose `PetscLogEvent` for transpose `CeedOperator` evaluation, or NULL + + @return An error code: 0 - success, otherwise - failure +**/ +PetscErrorCode MatCeedGetCeedOperatorLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose) { + MatCeedContext ctx; + + PetscFunctionBeginUser; + PetscCall(MatShellGetContext(mat, &ctx)); + if (log_event_mult) *log_event_mult = ctx->log_event_ceed_mult; + if (log_event_mult_transpose) *log_event_mult_transpose = ctx->log_event_ceed_mult_transpose; + PetscFunctionReturn(PETSC_SUCCESS); +} + // ----------------------------------------------------------------------------- // Operator context data // ----------------------------------------------------------------------------- @@ -925,20 +1319,23 @@ PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, Petsc Collective across MPI processes.
- @param[in] dm_x Input `DM` - @param[in] dm_y Output `DM` - @param[in] X_loc Input PETSc local vector, or NULL - @param[in] Y_loc_transpose Input PETSc local vector for transpose operation, or NULL - @param[in] op_mult `CeedOperator` for forward evaluation - @param[in] op_mult_transpose `CeedOperator` for transpose evaluation - @param[in] log_event_mult `PetscLogEvent` for forward evaluation - @param[in] log_event_mult_transpose `PetscLogEvent` for transpose evaluation - @param[out] ctx Context data for operator evaluation + @param[in] dm_x Input `DM` + @param[in] dm_y Output `DM` + @param[in] X_loc Input PETSc local vector, or NULL + @param[in] Y_loc_transpose Input PETSc local vector for transpose operation, or NULL + @param[in] op_mult `CeedOperator` for forward evaluation + @param[in] op_mult_transpose `CeedOperator` for transpose evaluation + @param[in] log_event_mult `PetscLogEvent` for forward evaluation + @param[in] log_event_mult_transpose `PetscLogEvent` for transpose evaluation + @param[in] log_event_ceed_mult `PetscLogEvent` for forward `CeedOperator` evaluation + @param[in] log_event_ceed_mult_transpose `PetscLogEvent` for transpose `CeedOperator` evaluation + @param[out] ctx Context data for operator evaluation @return An error code: 0 - success, otherwise - failure **/ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult, CeedOperator op_mult_transpose, - PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx) { + PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose, PetscLogEvent log_event_ceed_mult, + PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx) { CeedSize x_loc_len, y_loc_len; PetscFunctionBeginUser; @@ -948,18 +1345,16 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans (*ctx)->ref_count = 1; // Logging - (*ctx)->log_event_mult = log_event_mult; - (*ctx)->log_event_mult_transpose = 
log_event_mult_transpose; + (*ctx)->log_event_mult = log_event_mult; + (*ctx)->log_event_mult_transpose = log_event_mult_transpose; + (*ctx)->log_event_ceed_mult = log_event_ceed_mult; + (*ctx)->log_event_ceed_mult_transpose = log_event_ceed_mult_transpose; // PETSc objects - PetscCall(PetscObjectReference((PetscObject)dm_x)); - (*ctx)->dm_x = dm_x; - PetscCall(PetscObjectReference((PetscObject)dm_y)); - (*ctx)->dm_y = dm_y; - if (X_loc) PetscCall(PetscObjectReference((PetscObject)X_loc)); - (*ctx)->X_loc = X_loc; - if (Y_loc_transpose) PetscCall(PetscObjectReference((PetscObject)Y_loc_transpose)); - (*ctx)->Y_loc_transpose = Y_loc_transpose; + PetscCall(DMReferenceCopy(dm_x, &(*ctx)->dm_x)); + PetscCall(DMReferenceCopy(dm_y, &(*ctx)->dm_y)); + if (X_loc) PetscCall(VecReferenceCopy(X_loc, &(*ctx)->X_loc)); + if (Y_loc_transpose) PetscCall(VecReferenceCopy(Y_loc_transpose, &(*ctx)->Y_loc_transpose)); // Memtype { @@ -975,7 +1370,6 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans // libCEED objects PetscCheck(CeedOperatorGetCeed(op_mult, &(*ctx)->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "retrieving Ceed context object failed"); - PetscCallCeed((*ctx)->ceed, CeedReference((*ctx)->ceed)); PetscCallCeed((*ctx)->ceed, CeedOperatorGetActiveVectorLengths(op_mult, &x_loc_len, &y_loc_len)); PetscCallCeed((*ctx)->ceed, CeedOperatorReferenceCopy(op_mult, &(*ctx)->op_mult)); if (op_mult_transpose) PetscCallCeed((*ctx)->ceed, CeedOperatorReferenceCopy(op_mult_transpose, &(*ctx)->op_mult_transpose)); @@ -1062,7 +1456,7 @@ PetscErrorCode MatCeedContextReference(MatCeedContext ctx) { PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy) { PetscFunctionBeginUser; PetscCall(MatCeedContextReference(ctx)); - PetscCall(MatCeedContextDestroy(*ctx_copy)); + PetscCall(MatCeedContextDestroy(ctx_copy)); *ctx_copy = ctx; PetscFunctionReturn(PETSC_SUCCESS); } @@ -1076,33 +1470,33 @@ PetscErrorCode 
MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *c @return An error code: 0 - success, otherwise - failure **/ -PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx) { +PetscErrorCode MatCeedContextDestroy(MatCeedContext *ctx) { PetscFunctionBeginUser; - if (!ctx || --ctx->ref_count > 0) PetscFunctionReturn(PETSC_SUCCESS); + if (!ctx || !*ctx || --(*ctx)->ref_count > 0) PetscFunctionReturn(PETSC_SUCCESS); // PETSc objects - PetscCall(DMDestroy(&ctx->dm_x)); - PetscCall(DMDestroy(&ctx->dm_y)); - PetscCall(VecDestroy(&ctx->X_loc)); - PetscCall(VecDestroy(&ctx->Y_loc_transpose)); - PetscCall(MatDestroy(&ctx->mat_assembled_full_internal)); - PetscCall(MatDestroy(&ctx->mat_assembled_pbd_internal)); - PetscCall(PetscFree(ctx->internal_mat_type)); - PetscCall(PetscFree(ctx->mats_assembled_full)); - PetscCall(PetscFree(ctx->mats_assembled_pbd)); + PetscCall(DMDestroy(&(*ctx)->dm_x)); + PetscCall(DMDestroy(&(*ctx)->dm_y)); + PetscCall(VecDestroy(&(*ctx)->X_loc)); + PetscCall(VecDestroy(&(*ctx)->Y_loc_transpose)); + PetscCall(MatDestroy(&(*ctx)->mat_assembled_full_internal)); + PetscCall(MatDestroy(&(*ctx)->mat_assembled_pbd_internal)); + PetscCall(PetscFree((*ctx)->coo_mat_type)); + PetscCall(PetscFree((*ctx)->mats_assembled_full)); + PetscCall(PetscFree((*ctx)->mats_assembled_pbd)); // libCEED objects - PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->x_loc)); - PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->y_loc)); - PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->coo_values_full)); - PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->coo_values_pbd)); - PetscCallCeed(ctx->ceed, CeedOperatorDestroy(&ctx->op_mult)); - PetscCallCeed(ctx->ceed, CeedOperatorDestroy(&ctx->op_mult_transpose)); - PetscCheck(CeedDestroy(&ctx->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "destroying libCEED context object failed"); + PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->x_loc)); + PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->y_loc)); +
PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->coo_values_full)); + PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->coo_values_pbd)); + PetscCallCeed((*ctx)->ceed, CeedOperatorDestroy(&(*ctx)->op_mult)); + PetscCallCeed((*ctx)->ceed, CeedOperatorDestroy(&(*ctx)->op_mult_transpose)); + PetscCheck(CeedDestroy(&(*ctx)->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "destroying libCEED context object failed"); // Deallocate - ctx->is_destroyed = PETSC_TRUE; // Flag as destroyed in case someone has stale ref - PetscCall(PetscFree(ctx)); + (*ctx)->is_destroyed = PETSC_TRUE; // Flag as destroyed in case someone has stale ref + PetscCall(PetscFree(*ctx)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -1125,11 +1519,14 @@ PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D) { PetscCall(MatShellGetContext(A, &ctx)); // Place PETSc vector in libCEED vector + PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_DIAGONAL, A, D, NULL, NULL)); PetscCall(DMGetLocalVector(ctx->dm_x, &D_loc)); PetscCall(VecPetscToCeed(D_loc, &mem_type, ctx->x_loc)); // Compute Diagonal + PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, A, D, NULL, NULL)); PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleDiagonal(ctx->op_mult, ctx->x_loc, CEED_REQUEST_IMMEDIATE)); + PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, A, D, NULL, NULL)); // Restore PETSc vector PetscCall(VecCeedToPetsc(ctx->x_loc, mem_type, D_loc)); @@ -1138,6 +1535,7 @@ PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D) { PetscCall(VecZeroEntries(D)); PetscCall(DMLocalToGlobal(ctx->dm_x, D_loc, ADD_VALUES, D)); PetscCall(DMRestoreLocalVector(ctx->dm_x, &D_loc)); + PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_DIAGONAL, A, D, NULL, NULL)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -1157,7 +1555,7 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) { PetscFunctionBeginUser; PetscCall(MatShellGetContext(A, &ctx)); - PetscCall(PetscLogEventBegin(ctx->log_event_mult, A, X, Y, 0)); + 
PetscCall(PetscLogEventBegin(ctx->log_event_mult, A, X, Y, NULL)); { PetscMemType x_mem_type, y_mem_type; @@ -1176,9 +1574,11 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) { PetscCall(VecPetscToCeed(Y_loc, &y_mem_type, ctx->y_loc)); // Apply libCEED operator + PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult, A, X, Y, NULL)); PetscCall(PetscLogGpuTimeBegin()); PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult, ctx->x_loc, ctx->y_loc, CEED_REQUEST_IMMEDIATE)); PetscCall(PetscLogGpuTimeEnd()); + PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult, A, X, Y, NULL)); // Restore PETSc vectors PetscCall(VecReadCeedToPetsc(ctx->x_loc, x_mem_type, X_loc)); @@ -1196,8 +1596,7 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) { // Log flops if (PetscMemTypeDevice(ctx->mem_type)) PetscCall(PetscLogGpuFlops(ctx->flops_mult)); else PetscCall(PetscLogFlops(ctx->flops_mult)); - - PetscCall(PetscLogEventEnd(ctx->log_event_mult, A, X, Y, 0)); + PetscCall(PetscLogEventEnd(ctx->log_event_mult, A, X, Y, NULL)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -1217,7 +1616,7 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) { PetscFunctionBeginUser; PetscCall(MatShellGetContext(A, &ctx)); - PetscCall(PetscLogEventBegin(ctx->log_event_mult_transpose, A, Y, X, 0)); + PetscCall(PetscLogEventBegin(ctx->log_event_mult_transpose, A, Y, X, NULL)); { PetscMemType x_mem_type, y_mem_type; @@ -1236,9 +1635,11 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) { PetscCall(VecPetscToCeed(X_loc, &x_mem_type, ctx->x_loc)); // Apply libCEED operator + PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL)); PetscCall(PetscLogGpuTimeBegin()); PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult_transpose, ctx->y_loc, ctx->x_loc, CEED_REQUEST_IMMEDIATE)); PetscCall(PetscLogGpuTimeEnd()); + PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL)); // Restore PETSc vectors PetscCall(VecReadCeedToPetsc(ctx->y_loc, 
y_mem_type, Y_loc)); @@ -1256,7 +1657,6 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) { // Log flops if (PetscMemTypeDevice(ctx->mem_type)) PetscCall(PetscLogGpuFlops(ctx->flops_mult_transpose)); else PetscCall(PetscLogFlops(ctx->flops_mult_transpose)); - - PetscCall(PetscLogEventEnd(ctx->log_event_mult_transpose, A, Y, X, 0)); + PetscCall(PetscLogEventEnd(ctx->log_event_mult_transpose, A, Y, X, NULL)); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c index e73769741c..cebed9689b 100644 --- a/examples/fluids/src/misc.c +++ b/examples/fluids/src/misc.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -95,6 +95,7 @@ PetscErrorCode DMPlexInsertBoundaryValues_FromICs(DM dm, PetscBool insert_essent static PetscErrorCode BinaryReadIntoInt(PetscViewer viewer, PetscInt *out, PetscDataType file_type) { PetscFunctionBeginUser; + *out = -13; // appease the overzealous GCC compiler warning Gods if (file_type == PETSC_INT32) { PetscInt32 val; PetscCall(PetscViewerBinaryRead(viewer, &val, 1, NULL, PETSC_INT32)); @@ -140,31 +141,34 @@ PetscErrorCode LoadFluidsBinaryVec(MPI_Comm comm, PetscViewer viewer, Vec Q, Pet // Compare reference solution values with current test run for CI PetscErrorCode RegressionTest(AppCtx app_ctx, Vec Q) { - Vec Qref; + Vec Q_ref; PetscViewer viewer; - PetscReal error, Qrefnorm; + PetscReal error, norm_Q, norm_Q_ref; MPI_Comm comm = PetscObjectComm((PetscObject)Q); PetscFunctionBeginUser; // Read reference file - PetscCall(VecDuplicate(Q, &Qref)); + PetscCall(VecDuplicate(Q, &Q_ref)); + PetscCheck(strcmp(app_ctx->test_file_path, "") != 0, comm, PETSC_ERR_FILE_READ, "File for regression test 
not given"); PetscCall(PetscViewerBinaryOpen(comm, app_ctx->test_file_path, FILE_MODE_READ, &viewer)); - PetscCall(LoadFluidsBinaryVec(comm, viewer, Qref, NULL, NULL)); + PetscCall(LoadFluidsBinaryVec(comm, viewer, Q_ref, NULL, NULL)); // Compute error with respect to reference solution - PetscCall(VecAXPY(Q, -1.0, Qref)); - PetscCall(VecNorm(Qref, NORM_MAX, &Qrefnorm)); - PetscCall(VecScale(Q, 1. / Qrefnorm)); + PetscCall(VecNorm(Q_ref, NORM_MAX, &norm_Q)); + PetscCall(VecNorm(Q_ref, NORM_MAX, &norm_Q_ref)); + PetscCall(VecAXPY(Q, -1.0, Q_ref)); + PetscCall(VecScale(Q, 1. / norm_Q_ref)); PetscCall(VecNorm(Q, NORM_MAX, &error)); // Check error if (error > app_ctx->test_tol) { - PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\n", (double)error)); + PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\nReference solution max norm: %g Computed solution max norm %g\n", + (double)error, (double)norm_Q_ref, (double)norm_Q)); } // Cleanup PetscCall(PetscViewerDestroy(&viewer)); - PetscCall(VecDestroy(&Qref)); + PetscCall(VecDestroy(&Q_ref)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -200,7 +204,7 @@ PetscErrorCode PostProcess(TS ts, CeedData ceed_data, DM dm, ProblemData problem PetscFunctionBeginUser; // Print relative error - if (problem->non_zero_time && user->app_ctx->test_type == TESTTYPE_NONE) { + if (problem->compute_exact_solution_error && user->app_ctx->test_type == TESTTYPE_NONE) { PetscCall(PrintError(ceed_data, dm, user, Q, final_time)); } @@ -340,65 +344,35 @@ PetscErrorCode PhastaDatFileGetNRows(const MPI_Comm comm, const char path[PETSC_ PetscErrorCode PhastaDatFileReadToArrayReal(MPI_Comm comm, const char path[PETSC_MAX_PATH_LEN], PetscReal array[]) { PetscInt dims[2]; - int ndims; FILE *fp; const PetscInt char_array_len = 512; char line[char_array_len]; - char **row_array; PetscFunctionBeginUser; PetscCall(PhastaDatFileOpen(comm, path, char_array_len, dims, &fp)); for (PetscInt i = 0; i < dims[0]; i++) { + int 
ndims; + char **row_array; + PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line)); PetscCall(PetscStrToArray(line, ' ', &ndims, &row_array)); PetscCheck(ndims == dims[1], comm, PETSC_ERR_FILE_UNEXPECTED, "Line %" PetscInt_FMT " of %s does not contain enough columns (%d instead of %" PetscInt_FMT ")", i, path, ndims, dims[1]); - for (PetscInt j = 0; j < dims[1]; j++) { - array[i * dims[1] + j] = (PetscReal)atof(row_array[j]); - } + for (PetscInt j = 0; j < dims[1]; j++) array[i * dims[1] + j] = (PetscReal)atof(row_array[j]); + PetscCall(PetscStrToArrayDestroy(ndims, row_array)); } PetscCall(PetscFClose(comm, fp)); PetscFunctionReturn(PETSC_SUCCESS); } -PetscLogEvent FLUIDS_CeedOperatorApply; -PetscLogEvent FLUIDS_CeedOperatorAssemble; -PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal; -PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal; -PetscLogEvent FLUIDS_SmartRedis_Init; -PetscLogEvent FLUIDS_SmartRedis_Meta; -PetscLogEvent FLUIDS_SmartRedis_Train; -PetscLogEvent FLUIDS_TrainDataCompute; -PetscLogEvent FLUIDS_DifferentialFilter; -PetscLogEvent FLUIDS_VelocityGradientProjection; -static PetscClassId libCEED_classid, onlineTrain_classid, misc_classid; - -PetscErrorCode RegisterLogEvents() { - PetscFunctionBeginUser; - PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid)); - PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply)); - PetscCall(PetscLogEventRegister("CeedOpAsm", libCEED_classid, &FLUIDS_CeedOperatorAssemble)); - PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal)); - PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal)); - - PetscCall(PetscClassIdRegister("onlineTrain", &onlineTrain_classid)); - PetscCall(PetscLogEventRegister("SmartRedis_Init", onlineTrain_classid, &FLUIDS_SmartRedis_Init)); - PetscCall(PetscLogEventRegister("SmartRedis_Meta", onlineTrain_classid, 
&FLUIDS_SmartRedis_Meta)); - PetscCall(PetscLogEventRegister("SmartRedis_Train", onlineTrain_classid, &FLUIDS_SmartRedis_Train)); - PetscCall(PetscLogEventRegister("TrainDataCompute", onlineTrain_classid, &FLUIDS_TrainDataCompute)); - - PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid)); - PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter)); - PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection)); - PetscFunctionReturn(PETSC_SUCCESS); -} - // Print information about the given simulation run -PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MPI_Comm comm) { - Ceed ceed = user->ceed; +PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts) { + Ceed ceed = user->ceed; + MPI_Comm comm = PetscObjectComm((PetscObject)ts); + PetscFunctionBeginUser; // Header and rank char host_name[PETSC_MAX_PATH_LEN]; @@ -427,22 +401,43 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP " libCEED Backend MemType : %s\n", used_resource, CeedMemTypes[mem_type_backend])); // PETSc - char box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3"; + VecType vec_type; + char box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3"; if (problem->dim == 2) box_faces_str[3] = '\0'; PetscCall(PetscOptionsGetString(NULL, NULL, "-dm_plex_box_faces", box_faces_str, sizeof(box_faces_str), NULL)); - MatType amat_type = user->app_ctx->amat_type, pmat_type; - VecType vec_type; - PetscCall(DMGetMatType(user->dm, &pmat_type)); - if (!amat_type) amat_type = pmat_type; PetscCall(DMGetVecType(user->dm, &vec_type)); PetscCall(PetscPrintf(comm, " PETSc:\n" " Box Faces : %s\n" - " A MatType : %s\n" - " P MatType : %s\n" " DM VecType : %s\n" " Time Stepping Scheme : %s\n", - box_faces_str, amat_type, pmat_type, vec_type, phys_ctx->implicit ? "implicit" : "explicit")); + box_faces_str, vec_type, phys_ctx->implicit ? 
"implicit" : "explicit")); + { + char pmat_type_str[PETSC_MAX_PATH_LEN]; + MatType amat_type, pmat_type; + Mat Amat, Pmat; + TSIJacobianFn *ijacob_function; + + PetscCall(TSGetIJacobian(ts, &Amat, &Pmat, &ijacob_function, NULL)); + PetscCall(MatGetType(Amat, &amat_type)); + PetscCall(MatGetType(Pmat, &pmat_type)); + + PetscCall(PetscStrncpy(pmat_type_str, pmat_type, sizeof(pmat_type_str))); + if (!strcmp(pmat_type, MATCEED)) { + MatType pmat_coo_type; + char pmat_coo_type_str[PETSC_MAX_PATH_LEN]; + + PetscCall(MatCeedGetCOOMatType(Pmat, &pmat_coo_type)); + PetscCall(PetscSNPrintf(pmat_coo_type_str, sizeof(pmat_coo_type_str), " (COO MatType: %s)", pmat_coo_type)); + PetscCall(PetscStrlcat(pmat_type_str, pmat_coo_type_str, sizeof(pmat_type_str))); + } + if (ijacob_function) { + PetscCall(PetscPrintf(comm, + " IJacobian A MatType : %s\n" + " IJacobian P MatType : %s\n", + amat_type, pmat_type_str)); + } + } if (user->app_ctx->cont_steps) { PetscCall(PetscPrintf(comm, " Continue:\n" @@ -480,9 +475,10 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP part_owned_dofs[1] = gather_buffer[comm_size - 1]; // max part_owned_dofs[2] = gather_buffer[median_index]; // median PetscReal part_owned_dof_ratio = (PetscReal)part_owned_dofs[1] / (PetscReal)part_owned_dofs[2]; - PetscCall(PetscPrintf( - comm, " Global Vector %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", num_comp_q, - part_owned_dofs[0] / num_comp_q, part_owned_dofs[1] / num_comp_q, part_owned_dofs[2] / num_comp_q, part_owned_dof_ratio)); + PetscCall(PetscPrintf(comm, + " Global Vector %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", + num_comp_q, part_owned_dofs[0] / num_comp_q, part_owned_dofs[1] / num_comp_q, part_owned_dofs[2] / num_comp_q, + part_owned_dof_ratio)); } PetscCallMPI(MPI_Gather(&local_dofs, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm)); @@ -492,18 +488,20 @@ PetscErrorCode 
PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP part_local_dofs[1] = gather_buffer[comm_size - 1]; // max part_local_dofs[2] = gather_buffer[median_index]; // median PetscReal part_local_dof_ratio = (PetscReal)part_local_dofs[1] / (PetscReal)part_local_dofs[2]; - PetscCall(PetscPrintf( - comm, " Local Vector %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", num_comp_q, - part_local_dofs[0] / num_comp_q, part_local_dofs[1] / num_comp_q, part_local_dofs[2] / num_comp_q, part_local_dof_ratio)); + PetscCall(PetscPrintf(comm, + " Local Vector %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", + num_comp_q, part_local_dofs[0] / num_comp_q, part_local_dofs[1] / num_comp_q, part_local_dofs[2] / num_comp_q, + part_local_dof_ratio)); } if (comm_size != 1) { PetscInt num_remote_roots_total = 0, num_remote_leaves_total = 0, num_ghost_interface_ranks = 0, num_owned_interface_ranks = 0; { PetscSF sf; - PetscInt nrranks, niranks; + PetscMPIInt nrranks, niranks; const PetscInt *roffset, *rmine, *rremote, *ioffset, *irootloc; const PetscMPIInt *rranks, *iranks; + PetscCall(DMGetSectionSF(user->dm, &sf)); PetscCall(PetscSFGetRootRanks(sf, &nrranks, &rranks, &roffset, &rmine, &rremote)); PetscCall(PetscSFGetLeafRanks(sf, &niranks, &iranks, &ioffset, &irootloc)); @@ -525,10 +523,11 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP part_boundary_dofs[1] = gather_buffer[comm_size - 1]; // max part_boundary_dofs[2] = gather_buffer[median_index]; // median PetscReal part_shared_dof_ratio = (PetscReal)part_boundary_dofs[1] / (PetscReal)part_boundary_dofs[2]; - PetscCall(PetscPrintf( - comm, " Ghost Interface %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", - num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q, - part_shared_dof_ratio)); + 
PetscCall(PetscPrintf(comm, + " Ghost Interface %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT + ", %f\n", + num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q, + part_shared_dof_ratio)); } PetscCallMPI(MPI_Gather(&num_ghost_interface_ranks, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm)); @@ -549,10 +548,11 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP part_boundary_dofs[1] = gather_buffer[comm_size - 1]; // max part_boundary_dofs[2] = gather_buffer[median_index]; // median PetscReal part_shared_dof_ratio = (PetscReal)part_boundary_dofs[1] / (PetscReal)part_boundary_dofs[2]; - PetscCall(PetscPrintf( - comm, " Owned Interface %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", - num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q, - part_shared_dof_ratio)); + PetscCall(PetscPrintf(comm, + " Owned Interface %" PetscInt_FMT "-DoF nodes : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT + ", %f\n", + num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q, + part_shared_dof_ratio)); } PetscCallMPI(MPI_Gather(&num_owned_interface_ranks, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm)); diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c index 3706751db0..786b081b2e 100644 --- a/examples/fluids/src/petsc_ops.c +++ b/examples/fluids/src/petsc_ops.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -71,33 +71,37 @@ PetscErrorCode OperatorApplyContextCreate(DM dm_x, DM dm_y, Ceed ceed, CeedOpera PetscCall(VecGetLocalSize(X_loc, &X_size)); PetscCheck(X_size == x_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "X_loc (%" PetscInt_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", X_size, x_size); - if (dm_x) + if (dm_x) { PetscCheck(X_size == dm_X_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "X_loc size (%" PetscInt_FMT ") does not match dm_x local vector size (%" PetscInt_FMT ")", X_size, dm_X_size); + } } if (Y_loc) { PetscCall(VecGetLocalSize(Y_loc, &Y_size)); PetscCheck(Y_size == y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "Y_loc (%" PetscInt_FMT ") not correct size for CeedOperator active output size (%" CeedSize_FMT ")", Y_size, y_size); - if (dm_y) + if (dm_y) { PetscCheck(Y_size == dm_Y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "Y_loc size (%" PetscInt_FMT ") does not match dm_y local vector size (%" PetscInt_FMT ")", Y_size, dm_Y_size); + } } if (x_ceed && x_ceed != CEED_VECTOR_NONE) { PetscCallCeed(ceed, CeedVectorGetLength(x_ceed, &x_ceed_size)); PetscCheck(x_size >= 0 ? 
x_ceed_size == x_size : true, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "x_ceed (%" CeedSize_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", x_ceed_size, x_size); - if (dm_x) + if (dm_x) { PetscCheck(x_ceed_size == dm_X_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "x_ceed size (%" CeedSize_FMT ") does not match dm_x local vector size (%" PetscInt_FMT ")", x_ceed_size, dm_X_size); + } } if (y_ceed && y_ceed != CEED_VECTOR_NONE) { PetscCallCeed(ceed, CeedVectorGetLength(y_ceed, &y_ceed_size)); PetscCheck(y_ceed_size == y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "y_ceed (%" CeedSize_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", y_ceed_size, y_size); - if (dm_y) + if (dm_y) { PetscCheck(y_ceed_size == dm_Y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "y_ceed size (%" CeedSize_FMT ") does not match dm_y local vector size (%" PetscInt_FMT ")", y_ceed_size, dm_Y_size); + } } } @@ -176,9 +180,12 @@ VecType DMReturnVecType(DM dm) { PetscErrorCode CeedOperatorCreateLocalVecs(CeedOperator op, VecType vec_type, MPI_Comm comm, Vec *input, Vec *output) { CeedSize input_size, output_size; Ceed ceed; + int comm_size; PetscFunctionBeginUser; PetscCall(CeedOperatorGetCeed(op, &ceed)); + PetscCallMPI(MPI_Comm_size(comm, &comm_size)); + PetscCheck(comm_size == 1, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "MPI_Comm must be of size 1, recieved comm of size %d", comm_size); PetscCallCeed(ceed, CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); if (input) { PetscCall(VecCreate(comm, input)); @@ -190,6 +197,7 @@ PetscErrorCode CeedOperatorCreateLocalVecs(CeedOperator op, VecType vec_type, MP PetscCall(VecSetType(*output, vec_type)); PetscCall(VecSetSizes(*output, output_size, output_size)); } + PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, comm, PETSC_ERR_LIB, "Destroying Ceed object failed"); PetscFunctionReturn(PETSC_SUCCESS); } @@ -306,10 +314,8 @@ PetscErrorCode ApplyAddCeedOperatorLocalToLocal(Vec X_loc, 
Vec Y_loc, OperatorAp */ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool assemble, Mat *Amat, Mat *Pmat) { PetscBool use_matceed_pmat, assemble_amat = PETSC_FALSE; - MatType mat_ceed_inner_type; PetscFunctionBeginUser; - PetscCall(MatCeedGetInnerMatType(mat_ceed, &mat_ceed_inner_type)); { // Determine if Amat should be MATCEED or assembled const char *ksp_prefix = NULL; @@ -320,7 +326,7 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool } if (assemble_amat) { - PetscCall(MatConvert(mat_ceed, mat_ceed_inner_type, MAT_INITIAL_MATRIX, Amat)); + PetscCall(MatCeedCreateMatCOO(mat_ceed, Amat)); if (assemble) PetscCall(MatCeedAssembleCOO(mat_ceed, *Amat)); PetscCall(PetscObjectReference((PetscObject)*Amat)); @@ -337,14 +343,14 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool PetscCall(KSPGetPC(ksp, &pc)); PetscCall(PCGetType(pc, &pc_type)); - PetscCall(PetscStrcmpAny(pc_type, &use_matceed_pmat, PCJACOBI, PCVPBJACOBI, PCPBJACOBI, "")); + PetscCall(PetscStrcmpAny(pc_type, &use_matceed_pmat, PCNONE, PCJACOBI, PCVPBJACOBI, PCPBJACOBI, "")); } if (use_matceed_pmat) { PetscCall(PetscObjectReference((PetscObject)mat_ceed)); *Pmat = mat_ceed; } else { - PetscCall(MatConvert(mat_ceed, mat_ceed_inner_type, MAT_INITIAL_MATRIX, Pmat)); + PetscCall(MatCeedCreateMatCOO(mat_ceed, Pmat)); if (assemble) PetscCall(MatCeedAssembleCOO(mat_ceed, *Pmat)); } PetscFunctionReturn(PETSC_SUCCESS); diff --git a/examples/fluids/src/qdata.c b/examples/fluids/src/qdata.c new file mode 100644 index 0000000000..4288220a6b --- /dev/null +++ b/examples/fluids/src/qdata.c @@ -0,0 +1,199 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "../navierstokes.h" + +#include +#include "../qfunctions/setupgeo.h" +#include "../qfunctions/setupgeo2d.h" + +/** + * @brief Get number of components of quadrature data for domain + * + * @param[in] dm DM where quadrature data would be used + * @param[out] q_data_size Number of components of quadrature data + */ +PetscErrorCode QDataGetNumComponents(DM dm, CeedInt *q_data_size) { + PetscInt num_comp_x, dim; + + PetscFunctionBeginUser; + PetscCall(DMGetDimension(dm, &dim)); + { // Get number of coordinate components + DM dm_coord; + PetscSection section_coord; + PetscInt field = 0; // Default field has the coordinates + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + PetscCall(DMGetLocalSection(dm_coord, &section_coord)); + PetscCall(PetscSectionGetFieldComponents(section_coord, field, &num_comp_x)); + } + switch (dim) { + case 2: + switch (num_comp_x) { + case 2: + *q_data_size = 5; + break; + case 3: + *q_data_size = 7; + break; + default: + SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, + "QData not valid for DM of dimension %" PetscInt_FMT " and coordinates with dimension %" PetscInt_FMT, dim, num_comp_x); + break; + } + break; + case 3: + *q_data_size = 10; + break; + default: + SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, + "QData not valid for DM of dimension %" PetscInt_FMT " and coordinates with dimension %" PetscInt_FMT, dim, num_comp_x); + break; + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + * @brief Create quadrature data for domain + * + * @param[in] ceed Ceed object quadrature data will be used with + * @param[in] dm DM where quadrature data would be used + * @param[in] domain_label DMLabel that quadrature data would be used on + * @param[in] label_value Value of label + * @param[in] elem_restr_x CeedElemRestriction of the coordinates (must match `domain_label` and `label_value` selections) + * @param[in] 
basis_x CeedBasis of the coordinates + * @param[in] x_coord CeedVector of the coordinates + * @param[out] elem_restr_qd CeedElemRestriction of the quadrature data + * @param[out] q_data CeedVector of the quadrature data + * @param[out] q_data_size number of components of quadrature data + */ +PetscErrorCode QDataGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x, + CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size) { + CeedQFunction qf_setup; + CeedOperator op_setup; + CeedInt num_comp_x; + PetscInt dim, height = 0; + + PetscFunctionBeginUser; + PetscCall(QDataGetNumComponents(dm, q_data_size)); + PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x)); + PetscCall(DMGetDimension(dm, &dim)); + switch (dim) { + case 2: + switch (num_comp_x) { + case 2: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup2d, Setup2d_loc, &qf_setup)); + break; + case 3: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup2D_3Dcoords, Setup2D_3Dcoords_loc, &qf_setup)); + break; + } + break; + case 3: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup, Setup_loc, &qf_setup)); + break; + } + + // -- Create QFunction for quadrature data + PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup, 0)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup, "dx", num_comp_x * (dim - height), CEED_EVAL_GRAD)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup, "surface qdata", *q_data_size, CEED_EVAL_NONE)); + + PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, *q_data_size, elem_restr_qd)); + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(*elem_restr_qd, q_data, NULL)); + + PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup, NULL, NULL, &op_setup)); + PetscCallCeed(ceed, 
CeedOperatorSetField(op_setup, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "surface qdata", *elem_restr_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + + PetscCallCeed(ceed, CeedOperatorApply(op_setup, x_coord, *q_data, CEED_REQUEST_IMMEDIATE)); + + PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + * @brief Get number of components of quadrature data for boundary of domain + * + * @param[in] dm DM where quadrature data would be used + * @param[out] q_data_size Number of components of quadrature data + */ +PetscErrorCode QDataBoundaryGetNumComponents(DM dm, CeedInt *q_data_size) { + PetscInt dim; + + PetscFunctionBeginUser; + PetscCall(DMGetDimension(dm, &dim)); + switch (dim) { + case 2: + *q_data_size = 3; + break; + case 3: + *q_data_size = 10; + break; + default: + SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, "QDataBoundary not valid for DM of dimension %" PetscInt_FMT, dim); + break; + } + PetscFunctionReturn(PETSC_SUCCESS); +} + +/** + * @brief Create quadrature data for boundary of domain + * + * @param[in] ceed Ceed object quadrature data will be used with + * @param[in] dm DM where quadrature data would be used + * @param[in] domain_label DMLabel that quadrature data would be used on + * @param[in] label_value Value of label + * @param[in] elem_restr_x CeedElemRestriction of the coordinates (must match `domain_label` and `label_value` selections) + * @param[in] basis_x CeedBasis of the coordinates + * @param[in] x_coord CeedVector of the coordinates + * @param[out] elem_restr_qd CeedElemRestriction of the quadrature data + * @param[out] q_data CeedVector of the quadrature data + * @param[out] q_data_size number of components of quadrature data + */ +PetscErrorCode 
QDataBoundaryGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x, + CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size) { + CeedQFunction qf_setup_sur; + CeedOperator op_setup_sur; + CeedInt num_comp_x; + PetscInt dim, height = 1; + + PetscFunctionBeginUser; + PetscCall(QDataBoundaryGetNumComponents(dm, q_data_size)); + PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x)); + PetscCall(DMGetDimension(dm, &dim)); + switch (dim) { + case 2: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupBoundary2d, SetupBoundary2d_loc, &qf_setup_sur)); + break; + case 3: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupBoundary, SetupBoundary_loc, &qf_setup_sur)); + break; + } + + // -- Create QFunction for quadrature data + PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup_sur, 0)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_sur, "dx", num_comp_x * (dim - height), CEED_EVAL_GRAD)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup_sur, "surface qdata", *q_data_size, CEED_EVAL_NONE)); + + PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, *q_data_size, elem_restr_qd)); + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(*elem_restr_qd, q_data, NULL)); + + PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup_sur, NULL, NULL, &op_setup_sur)); + PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", *elem_restr_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + + PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, 
x_coord, *q_data, CEED_REQUEST_IMMEDIATE)); + + PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup_sur)); + PetscFunctionReturn(PETSC_SUCCESS); +} diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c index a0df7dfe69..25573ca63d 100644 --- a/examples/fluids/src/setupdm.c +++ b/examples/fluids/src/setupdm.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -52,24 +52,19 @@ PetscErrorCode SetUpDM(DM dm, ProblemData problem, PetscInt degree, PetscInt q_e DMLabel label; PetscCall(DMGetLabel(dm, "Face Sets", &label)); PetscCall(DMPlexLabelComplete(dm, label)); - // Set wall BCs - if (bc->num_wall > 0) { - PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, bc->num_wall, bc->walls, 0, bc->num_comps, bc->wall_comps, NULL, NULL, NULL, NULL)); - } - // Set symmetry BCs in the x direction - if (bc->num_symmetry[0] > 0) { - PetscInt comps[1] = {1}; - PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_x", label, bc->num_symmetry[0], bc->symmetries[0], 0, 1, comps, NULL, NULL, NULL, NULL)); - } - // Set symmetry BCs in the y direction - if (bc->num_symmetry[1] > 0) { - PetscInt comps[1] = {2}; - PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_y", label, bc->num_symmetry[1], bc->symmetries[1], 0, 1, comps, NULL, NULL, NULL, NULL)); - } - // Set symmetry BCs in the z direction - if (bc->num_symmetry[2] > 0) { - PetscInt comps[1] = {3}; - PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_z", label, bc->num_symmetry[2], bc->symmetries[2], 0, 1, comps, NULL, NULL, NULL, NULL)); + + for (PetscInt i = 0; i < problem->num_bc_defs; i++) { + BCDefinition bc_def = problem->bc_defs[i]; + PetscInt 
num_essential_comps, num_label_values; + const PetscInt *essential_comps, *label_values; + const char *name; + + PetscCall(BCDefinitionGetEssential(bc_def, &num_essential_comps, &essential_comps)); + if (essential_comps > 0) { + PetscCall(BCDefinitionGetInfo(bc_def, &name, &num_label_values, &label_values)); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, name, label, num_label_values, label_values, 0, num_essential_comps, essential_comps, NULL, NULL, + NULL, NULL)); + } } { PetscBool use_strongstg = PETSC_FALSE; @@ -100,6 +95,14 @@ PetscErrorCode SetUpDM(DM dm, ProblemData problem, PetscInt degree, PetscInt q_e PetscCall(PetscSectionSetComponentName(section, 0, 3, "VelocityZ")); PetscCall(PetscSectionSetComponentName(section, 0, 4, "Temperature")); break; + + case STATEVAR_ENTROPY: + PetscCall(PetscSectionSetComponentName(section, 0, 0, "EntropyDensity")); + PetscCall(PetscSectionSetComponentName(section, 0, 1, "EntropyMomentumX")); + PetscCall(PetscSectionSetComponentName(section, 0, 2, "EntropyMomentumY")); + PetscCall(PetscSectionSetComponentName(section, 0, 3, "EntropyMomentumZ")); + PetscCall(PetscSectionSetComponentName(section, 0, 4, "EntropyTotalEnergy")); + break; } PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c index 18630c0279..bb801aa269 100644 --- a/examples/fluids/src/setuplibceed.c +++ b/examples/fluids/src/setuplibceed.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -28,14 +28,12 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator CeedOperatorField field; PetscInt sub_op_index = 0; // will be 0 for the volume op - PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops)); + PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q)); - PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL)); PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i)); - PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data)); + PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data)); } PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q)); @@ -47,6 +45,10 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data)); PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedVectorDestroy(&q_data)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_q)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -68,7 +70,7 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) { PetscCall(DMCreateLocalVector(dm, &Zeros_loc)); PetscCall(VecZeroEntries(Zeros_loc)); - PetscCall(MatCeedCreate(dm, dm, 
op_mass, NULL, &mat_mass)); + PetscCall(MatCreateCeed(dm, dm, op_mass, NULL, &mat_mass)); PetscCall(MatCeedSetLocalVectors(mat_mass, Zeros_loc, NULL)); PetscCall(KSPCreate(comm, &user->mass_ksp)); @@ -81,7 +83,6 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) { PetscCall(KSPSetType(user->mass_ksp, KSPPREONLY)); } PetscCall(KSPSetFromOptions_WithMatCeed(user->mass_ksp, mat_mass)); - PetscCall(KSPSetFromOptions(user->mass_ksp)); PetscCall(VecDestroy(&Zeros_loc)); PetscCall(MatDestroy(&mat_mass)); } @@ -90,128 +91,66 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) { PetscFunctionReturn(PETSC_SUCCESS); } -PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt label_value, CeedInt height, CeedInt Q_sur, - CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian, - CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) { - CeedVector q_data_sur, jac_data_sur = NULL; - CeedOperator op_setup_sur, op_apply_bc, op_apply_bc_jacobian = NULL; +static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt label_value, CeedInt height, + CeedInt Q_sur, CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedBasis basis_q_sur, + CeedBasis basis_x_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian, CeedOperator op_apply, + CeedOperator op_apply_ijacobian) { + CeedVector q_data_sur, jac_data_sur = NULL; + CeedOperator op_apply_bc, op_apply_bc_jacobian = NULL; CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_qd_i_sur, elem_restr_jd_i_sur = NULL; - CeedInt num_qpts_sur, dm_field = 0; + PetscInt dm_field = 0; PetscFunctionBeginUser; - // --- Get number of quadrature points for the boundaries - PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q_sur, &num_qpts_sur)); - - // ---- CEED Restriction 
PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &elem_restr_q_sur)); PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &elem_restr_x_sur)); - PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_sur, &elem_restr_qd_i_sur)); if (jac_data_size_sur > 0) { // State-dependent data will be passed from residual to Jacobian. This will be collocated. PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_sur, &elem_restr_jd_i_sur)); PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i_sur, &jac_data_sur, NULL)); } - // ---- CEED Vector - CeedInt loc_num_elem_sur; - PetscCallCeed(ceed, CeedElemRestrictionGetNumElements(elem_restr_q_sur, &loc_num_elem_sur)); - PetscCallCeed(ceed, CeedVectorCreate(ceed, q_data_size_sur * loc_num_elem_sur * num_qpts_sur, &q_data_sur)); - - // ---- CEED Operator - // ----- CEED Operator for Setup (geometric factors) - PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur)); - PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, ceed_data->basis_x_sur, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x_sur, CEED_VECTOR_NONE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + PetscCall(QDataBoundaryGet(ceed, dm, domain_label, label_value, elem_restr_x_sur, basis_x_sur, ceed_data->x_coord, &elem_restr_qd_i_sur, + &q_data_sur, &q_data_size_sur)); - // ----- CEED Operator for Physics + // CEED Operator for Physics PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc, NULL, NULL, &op_apply_bc)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, ceed_data->basis_q_sur, 
CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE)); PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE)); - if (elem_restr_jd_i_sur) + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE)); + if (elem_restr_jd_i_sur) { PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur)); + } if (qf_apply_bc_jacobian && elem_restr_jd_i_sur) { PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc_jacobian, NULL, NULL, &op_apply_bc_jacobian)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE)); PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur)); - 
PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord)); PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur)); - PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE)); } - // ----- Apply CEED operator for Setup - PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE)); - - // ----- Apply Sub-Operator for Physics - PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply, op_apply_bc)); - if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_bc_jacobian)); + // Apply Sub-Operator for Physics + PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_apply, op_apply_bc)); + if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_apply_ijacobian, op_apply_bc_jacobian)); - // ----- Cleanup PetscCallCeed(ceed, CeedVectorDestroy(&q_data_sur)); PetscCallCeed(ceed, CeedVectorDestroy(&jac_data_sur)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q_sur)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_x_sur)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i_sur)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i_sur)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_apply_bc)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_apply_bc_jacobian)); PetscFunctionReturn(PETSC_SUCCESS); } -// Utility function to create CEED Composite Operator for the entire domain -PetscErrorCode 
CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol, - CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur, - CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) { - DMLabel domain_label; - - PetscFunctionBeginUser; - // Create Composite Operaters - PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, op_apply)); - if (op_apply_ijacobian) PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, op_apply_ijacobian)); - - // --Apply Sub-Operator for the volume - PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply, op_apply_vol)); - if (op_apply_ijacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_ijacobian_vol)); - - // -- Create Sub-Operator for in/outflow BCs - PetscCall(DMGetLabel(dm, "Face Sets", &domain_label)); - - // --- Create Sub-Operator for inflow boundaries - for (CeedInt i = 0; i < bc->num_inflow; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_inflow, ceed_data->qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian)); - } - // --- Create Sub-Operator for outflow boundaries - for (CeedInt i = 0; i < bc->num_outflow; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_outflow, ceed_data->qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian)); - } - // --- Create Sub-Operator for freestream boundaries - for (CeedInt i = 0; i < bc->num_freestream; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_freestream, ceed_data->qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian)); - } - // --- Create Sub-Operator for slip boundaries - for 
(CeedInt i = 0; i < bc->num_slip; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->slips[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_slip, ceed_data->qf_apply_slip_jacobian, op_apply, op_apply_ijacobian)); - } - - // ----- Get Context Labels for Operator - PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(*op_apply, "solution time", &phys->solution_time_label)); - PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(*op_apply, "timestep size", &phys->timestep_size_label)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur, - PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian, - CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) { +static PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur, + PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian, + CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) { PetscFunctionBeginUser; if (apply_bc.qfunction) { PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, apply_bc.qfunction, apply_bc.qfunction_loc, qf_apply_bc)); @@ -238,14 +177,117 @@ PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_ PetscFunctionReturn(PETSC_SUCCESS); } +// Utility function to add boundary operators to the composite operator +static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc, ProblemData problem, CeedData ceed_data, CeedOperator op_apply, + CeedOperator op_apply_ijacobian) { + CeedInt height = 1, num_comp_q, num_comp_x; + CeedInt P_sur = user->app_ctx->degree + 1, Q_sur = P_sur + user->app_ctx->q_extra, dim_sur, q_data_size_sur; + const CeedInt jac_data_size_sur = user->phys->implicit ? 
problem->jac_data_size_sur : 0; + PetscInt dim; + DMLabel face_sets_label; + CeedBasis basis_q_sur, basis_x_sur; + + PetscFunctionBeginUser; + PetscCall(DMGetDimension(dm, &dim)); + PetscCall(QDataBoundaryGetNumComponents(dm, &q_data_size_sur)); + dim_sur = dim - height; + { // Get number of components and coordinate dimension from op_apply + CeedOperator *sub_ops; + CeedOperatorField field; + PetscInt sub_op_index = 0; // will be 0 for the volume op + CeedElemRestriction elem_restr_q, elem_restr_x; + + PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(op_apply, &sub_ops)); + PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field)); + PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q)); + PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q)); + + PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "x", &field)); + PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_x)); + PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_x)); + } + + { // Get bases + DM dm_coord; + + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + DMLabel label = NULL; + PetscInt label_value = 0; + PetscInt field = 0; + PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, field, &basis_q_sur)); + PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, field, &basis_x_sur)); + } + + PetscCall(DMGetLabel(dm, "Face Sets", &face_sets_label)); + + { // --- Create Sub-Operator for inflow boundaries + CeedQFunction qf_apply_inflow = NULL, qf_apply_inflow_jacobian = NULL; + + PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow, + problem->apply_inflow_jacobian, &qf_apply_inflow, &qf_apply_inflow_jacobian)); + for 
(CeedInt i = 0; i < bc->num_inflow; i++) { + PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, basis_q_sur, + basis_x_sur, qf_apply_inflow, qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian)); + } + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_inflow)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_inflow_jacobian)); + } + + { // --- Create Sub-Operator for outflow boundaries + CeedQFunction qf_apply_outflow = NULL, qf_apply_outflow_jacobian = NULL; + + PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow, + problem->apply_outflow_jacobian, &qf_apply_outflow, &qf_apply_outflow_jacobian)); + for (CeedInt i = 0; i < bc->num_outflow; i++) { + PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, + basis_q_sur, basis_x_sur, qf_apply_outflow, qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian)); + } + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_outflow)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_outflow_jacobian)); + } + + { // --- Create Sub-Operator for freestream boundaries + CeedQFunction qf_apply_freestream = NULL, qf_apply_freestream_jacobian = NULL; + + PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream, + problem->apply_freestream_jacobian, &qf_apply_freestream, &qf_apply_freestream_jacobian)); + for (CeedInt i = 0; i < bc->num_freestream; i++) { + PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, + basis_q_sur, basis_x_sur, qf_apply_freestream, qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian)); + } + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_freestream)); + PetscCallCeed(ceed, 
CeedQFunctionDestroy(&qf_apply_freestream_jacobian)); + } + + { // --- Create Sub-Operator for slip boundaries + CeedQFunction qf_apply_slip = NULL, qf_apply_slip_jacobian = NULL; + + PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_slip, + problem->apply_slip_jacobian, &qf_apply_slip, &qf_apply_slip_jacobian)); + for (CeedInt i = 0; i < bc->num_slip; i++) { + PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->slips[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, basis_q_sur, + basis_x_sur, qf_apply_slip, qf_apply_slip_jacobian, op_apply, op_apply_ijacobian)); + } + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_slip)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_slip_jacobian)); + } + + PetscCallCeed(ceed, CeedBasisDestroy(&basis_q_sur)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_sur)); + PetscFunctionReturn(PETSC_SUCCESS); +} + PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc) { + const PetscInt num_comp_q = 5; + const CeedInt dim = problem->dim, num_comp_x = problem->dim; + CeedInt jac_data_size_vol = num_comp_q + 6 + 3; + CeedElemRestriction elem_restr_jd_i; + CeedVector jac_data; + CeedOperator op_ifunction_vol = NULL, op_rhs_vol = NULL, op_ijacobian_vol = NULL; + PetscFunctionBeginUser; - // ***************************************************************************** - // Set up CEED objects for the interior domain (volume) - // ***************************************************************************** - const PetscInt num_comp_q = 5; - const CeedInt dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol; - CeedInt jac_data_size_vol = num_comp_q + 6 + 3; if (problem->apply_vol_ifunction.qfunction && problem->uses_newtonian) { NewtonianIdealGasContext gas; @@ -254,265 +296,193 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData 
ceed_data, DM dm, User user, App PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas)); } - CeedElemRestriction elem_restr_jd_i; - CeedVector jac_data; - CeedInt num_qpts; - DMLabel domain_label = NULL; - PetscInt label_value = 0, height = 0, dm_field = 0; - - // ----------------------------------------------------------------------------- - // CEED Bases - // ----------------------------------------------------------------------------- - DM dm_coord; - PetscCall(DMGetCoordinateDM(dm, &dm_coord)); - - PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q)); - PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x)); - PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &ceed_data->basis_xc)); - PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q, &num_qpts)); - - // ----------------------------------------------------------------------------- - // CEED Restrictions - // ----------------------------------------------------------------------------- - // -- Create restriction - PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q)); - PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x)); - PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_vol, &ceed_data->elem_restr_qd_i)); - PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i)); - // -- Create E vectors - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL)); - PetscCallCeed(ceed, 
CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL)); - - // ----------------------------------------------------------------------------- - // CEED QFunctions - // ----------------------------------------------------------------------------- - // -- Create QFunction for quadrature data - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, problem->setup_vol.qfunction_loc, &ceed_data->qf_setup_vol)); - if (problem->setup_vol.qfunction_context) { - PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_vol, problem->setup_vol.qfunction_context)); + { // Create bases and element restrictions + DMLabel domain_label = NULL; + PetscInt label_value = 0, height = 0, dm_field = 0; + DM dm_coord; + + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q)); + PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x)); + + PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q)); + PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x)); + PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i)); + + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL)); + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL)); + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL)); + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL)); + PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL)); + + { // -- Copy PETSc coordinate vector into CEED vector + Vec 
X_loc; + DM cdm; + + PetscCall(DMGetCellCoordinateDM(dm, &cdm)); + if (cdm) { + PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc)); + } else { + PetscCall(DMGetCoordinatesLocal(dm, &X_loc)); + } + PetscCall(VecScale(X_loc, problem->dm_scale)); + PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord)); + } + + PetscCall(QDataGet(ceed, dm, domain_label, label_value, ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord, + &ceed_data->elem_restr_qd_i, &ceed_data->q_data, &problem->q_data_size_vol)); + } + + { // -- Create QFunction for ICs + CeedBasis basis_xc; + CeedQFunction qf_ics; + CeedOperator op_ics; + + PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &basis_xc)); + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &qf_ics)); + PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ics, problem->ics.qfunction_context)); + PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ics, 0)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ics, "x", num_comp_x, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ics, "dx", num_comp_x * dim, CEED_EVAL_GRAD)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ics, "q0", num_comp_q, CEED_EVAL_NONE)); + + PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ics, NULL, NULL, &op_ics)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, basis_xc, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, basis_xc, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label)); + PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx)); + + PetscCallCeed(ceed, 
CeedBasisDestroy(&basis_xc)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ics)); + PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics)); } - PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_vol, 0)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_vol, "dx", num_comp_x * dim, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_vol, "weight", 1, CEED_EVAL_WEIGHT)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE)); - - // -- Create QFunction for ICs - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &ceed_data->qf_ics)); - PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_ics, problem->ics.qfunction_context)); - PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_ics, 0)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ics, "x", num_comp_x, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ics, "dx", num_comp_x * dim, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ics, "q0", num_comp_q, CEED_EVAL_NONE)); - - // -- Create QFunction for RHS + if (problem->apply_vol_rhs.qfunction) { - PetscCallCeed( - ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &ceed_data->qf_rhs_vol)); - PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_rhs_vol, problem->apply_vol_rhs.qfunction_context)); - PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_rhs_vol, 0)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE)); - PetscCallCeed(ceed, 
CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD)); + CeedQFunction qf_rhs_vol; + + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &qf_rhs_vol)); + PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_vol, problem->apply_vol_rhs.qfunction_context)); + PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_rhs_vol, 0)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD)); + + PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_rhs_vol, NULL, NULL, &op_rhs_vol)); + PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); + PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord)); + PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, 
CeedOperatorSetField(op_rhs_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs_vol)); } - // -- Create QFunction for IFunction if (problem->apply_vol_ifunction.qfunction) { + CeedQFunction qf_ifunction_vol; + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, problem->apply_vol_ifunction.qfunction_loc, - &ceed_data->qf_ifunction_vol)); - PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context)); - PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_ifunction_vol, 0)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE)); + &qf_ifunction_vol)); + PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context)); + PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ifunction_vol, 0)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "Grad_q", 
num_comp_q * dim, CEED_EVAL_GRAD)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD)); + PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE)); + + PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ifunction_vol, NULL, NULL, &op_ifunction_vol)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data)); + + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ifunction_vol)); } - CeedQFunction qf_ijacobian_vol = NULL; if (problem->apply_vol_ijacobian.qfunction) { + 
CeedQFunction qf_ijacobian_vol; + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, problem->apply_vol_ijacobian.qfunction_loc, &qf_ijacobian_vol)); PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ijacobian_vol, problem->apply_vol_ijacobian.qfunction_context)); PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ijacobian_vol, 0)); PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, CEED_EVAL_INTERP)); PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q * dim, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE)); + PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE)); PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE)); PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP)); PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD)); - } - - // --------------------------------------------------------------------------- - // Element coordinates - // --------------------------------------------------------------------------- - // -- Create CEED vector - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL)); - - // -- Copy PETSc vector in CEED vector - Vec X_loc; - { - DM cdm; - PetscCall(DMGetCellCoordinateDM(dm, &cdm)); - if (cdm) { - PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc)); - } else { - PetscCall(DMGetCoordinatesLocal(dm, &X_loc)); - } - } - PetscCall(VecScale(X_loc, problem->dm_scale)); - PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord)); - - // ----------------------------------------------------------------------------- - // CEED vectors - // ----------------------------------------------------------------------------- - 
// -- Create CEED vector for geometric data - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL)); - - // ----------------------------------------------------------------------------- - // CEED Operators - // ----------------------------------------------------------------------------- - // -- Create CEED operator for quadrature data - PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_vol, NULL, NULL, &ceed_data->op_setup_vol)); - PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "dx", ceed_data->elem_restr_x, ceed_data->basis_x, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE)); - PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - // -- Create CEED operator for ICs - CeedOperator op_ics; - PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_ics, NULL, NULL, &op_ics)); - PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label)); - PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics)); - - // Create CEED operator for RHS - if (ceed_data->qf_rhs_vol) { - CeedOperator op; - PetscCallCeed(ceed, CeedOperatorCreate(ceed, 
ceed_data->qf_rhs_vol, NULL, NULL, &op)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - user->op_rhs_vol = op; - } - // -- CEED operator for IFunction - if (ceed_data->qf_ifunction_vol) { - CeedOperator op; - PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_ifunction_vol, NULL, NULL, &op)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data)); - - user->op_ifunction_vol = op; - } + PetscCallCeed(ceed, 
CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op_ijacobian_vol)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); + PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - CeedOperator op_ijacobian_vol = NULL; - if (qf_ijacobian_vol) { - CeedOperator op; - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE)); - op_ijacobian_vol = op; PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ijacobian_vol)); } - // ***************************************************************************** - // Set up CEED objects for the exterior domain (surface) - // 
***************************************************************************** - height = 1; - CeedInt dim_sur = dim - height, P_sur = app_ctx->degree + 1, Q_sur = P_sur + app_ctx->q_extra; - const CeedInt q_data_size_sur = problem->q_data_size_sur, jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0; - - // ----------------------------------------------------------------------------- - // CEED Bases - // ----------------------------------------------------------------------------- - - DMLabel label = 0; - PetscInt face_id = 0; - PetscInt field = 0; // Still want the normal, default field - PetscCall(CreateBasisFromPlex(ceed, dm, label, face_id, height, field, &ceed_data->basis_q_sur)); - PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, face_id, height, field, &ceed_data->basis_x_sur)); - - // ----------------------------------------------------------------------------- - // CEED QFunctions - // ----------------------------------------------------------------------------- - // -- Create QFunction for quadrature data - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, problem->setup_sur.qfunction_loc, &ceed_data->qf_setup_sur)); - if (problem->setup_sur.qfunction_context) { - PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_sur, problem->setup_sur.qfunction_context)); - } - PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_sur, 0)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x * dim_sur, CEED_EVAL_GRAD)); - PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", q_data_size_sur, CEED_EVAL_NONE)); - - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow, - problem->apply_inflow_jacobian, &ceed_data->qf_apply_inflow, 
&ceed_data->qf_apply_inflow_jacobian)); - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow, - problem->apply_outflow_jacobian, &ceed_data->qf_apply_outflow, &ceed_data->qf_apply_outflow_jacobian)); - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream, - problem->apply_freestream_jacobian, &ceed_data->qf_apply_freestream, &ceed_data->qf_apply_freestream_jacobian)); - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_slip, - problem->apply_slip_jacobian, &ceed_data->qf_apply_slip, &ceed_data->qf_apply_slip_jacobian)); - - // ***************************************************************************** - // CEED Operator Apply - // ***************************************************************************** - // -- Apply CEED Operator for the geometric data - PetscCallCeed(ceed, CeedOperatorApply(ceed_data->op_setup_vol, ceed_data->x_coord, ceed_data->q_data, CEED_REQUEST_IMMEDIATE)); - // -- Create and apply CEED Composite Operator for the entire domain if (!user->phys->implicit) { // RHS CeedOperator op_rhs; - PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_rhs_vol, NULL, height, P_sur, Q_sur, q_data_size_sur, 0, &op_rhs, - NULL)); + + PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_rhs)); + PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_rhs, op_rhs_vol)); + PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, op_rhs, NULL)); + PetscCall(OperatorApplyContextCreate(dm, dm, ceed, op_rhs, user->q_ceed, user->g_ceed, user->Q_loc, NULL, &user->op_rhs_ctx)); + + // ----- Get Context Labels for Operator + PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_rhs, "solution time", &user->phys->solution_time_label)); + PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_rhs, "timestep 
size", &user->phys->timestep_size_label)); + PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs)); PetscCall(CreateKSPMass(user, problem)); - PetscCheck(app_ctx->sgs_model_type == SGS_MODEL_NONE, user->comm, PETSC_ERR_SUP, "SGS modeling not implemented for explicit timestepping"); } else { // IFunction CeedOperator op_ijacobian = NULL; - PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_ifunction_vol, op_ijacobian_vol, height, P_sur, Q_sur, - q_data_size_sur, jac_data_size_sur, &user->op_ifunction, op_ijacobian_vol ? &op_ijacobian : NULL)); + // Create Composite Operators + PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &user->op_ifunction)); + PetscCallCeed(ceed, CeedOperatorCompositeAddSub(user->op_ifunction, op_ifunction_vol)); + if (op_ijacobian_vol) { + PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_ijacobian)); + PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_ijacobian, op_ijacobian_vol)); + } + PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, user->op_ifunction, op_ijacobian)); + + // ----- Get Context Labels for Operator + PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "solution time", &user->phys->solution_time_label)); + PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "timestep size", &user->phys->timestep_size_label)); + if (op_ijacobian) { - PetscCall(MatCeedCreate(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian)); + PetscCall(MatCreateCeed(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian)); PetscCall(MatCeedSetLocalVectors(user->mat_ijacobian, user->Q_dot_loc, NULL)); - PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ijacobian, "ijacobian time shift", &user->phys->ijacobian_time_shift_label)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian)); } - if (app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) PetscCall(SgsDDSetup(ceed, user, ceed_data, problem)); } if (problem->use_strong_bc_ceed) 
PetscCall(SetupStrongBC_Ceed(ceed, ceed_data, dm, user, problem, bc)); if (app_ctx->turb_spanstats_enable) PetscCall(TurbulenceStatisticsSetup(ceed, user, ceed_data, problem)); if (app_ctx->diff_filter_monitor && !user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem)); - if (app_ctx->sgs_train_enable) PetscCall(SGS_DD_TrainingSetup(ceed, user, ceed_data, problem)); + PetscCallCeed(ceed, CeedVectorDestroy(&jac_data)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian_vol)); - PetscCallCeed(ceed, CeedVectorDestroy(&jac_data)); + PetscCallCeed(ceed, CeedOperatorDestroy(&op_ifunction_vol)); + PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs_vol)); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c index 2ef3de7203..6ebaa66b39 100644 --- a/examples/fluids/src/setupts.c +++ b/examples/fluids/src/setupts.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -140,10 +140,6 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u PetscCall(VecReadCeedToPetsc(user->q_dot_ceed, q_dot_mem_type, Q_dot_loc)); PetscCall(VecCeedToPetsc(user->g_ceed, g_mem_type, G_loc)); - if (user->app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) { - PetscCall(SgsDDApplyIFunction(user, Q_loc, G_loc)); - } - // Local-to-Global PetscCall(VecZeroEntries(G)); PetscCall(DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G)); @@ -155,7 +151,6 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal shift, Mat J, Mat J_pre, void *user_data) { User user = *(User *)user_data; - Ceed ceed = user->ceed; PetscBool J_is_matceed, J_is_mffd, J_pre_is_matceed, J_pre_is_mffd; PetscFunctionBeginUser; @@ -163,12 +158,8 @@ PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal PetscCall(PetscObjectTypeCompare((PetscObject)J, MATCEED, &J_is_matceed)); PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATMFFD, &J_pre_is_mffd)); PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATCEED, &J_pre_is_matceed)); - if (user->phys->ijacobian_time_shift_label) { - CeedOperator op_ijacobian; - PetscCall(MatCeedGetCeedOperators(user->mat_ijacobian, &op_ijacobian, NULL)); - PetscCallCeed(ceed, CeedOperatorSetContextDouble(op_ijacobian, user->phys->ijacobian_time_shift_label, &shift)); - } + PetscCall(MatCeedSetContextReal(user->mat_ijacobian, "ijacobian time shift", shift)); if (J_is_matceed || J_is_mffd) { PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY)); @@ -216,8 +207,8 @@ PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, PetscScalar time) PetscCall(VecZeroEntries(Q_refined_loc)); PetscCall(DMGlobalToLocal(user->dm_viz, Q_refined, INSERT_VALUES, Q_refined_loc)); - PetscCall( - PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" 
PetscInt_FMT ".vtu", user->app_ctx->output_dir, step_no)); + PetscCall(PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir, + step_no)); PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q_refined), file_path_refined, FILE_MODE_WRITE, &viewer_refined)); PetscCall(VecView(Q_refined_loc, viewer_refined)); @@ -303,7 +294,7 @@ PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void } // TS: Create, setup, and solve -PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts) { +PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, ProblemData problem, Vec *Q, PetscScalar *f_time, TS *ts) { MPI_Comm comm = user->comm; TSAdapt adapt; PetscScalar final_time; @@ -378,10 +369,7 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q } if (app_ctx->diff_filter_monitor) PetscCall(TSMonitorSet(*ts, TSMonitor_DifferentialFilter, user, NULL)); - if (app_ctx->sgs_train_enable) { - PetscCall(TSMonitorSet(*ts, TSMonitor_SGS_DD_Training, user, NULL)); - PetscCall(TSSetPostStep(*ts, TSPostStep_SGS_DD_Training)); - } + if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, user->phys, problem, *ts)); // Solve PetscReal start_time; PetscInt start_step; diff --git a/examples/fluids/src/smartsim/sgs_dd_training.c b/examples/fluids/src/smartsim/sgs_dd_training.c deleted file mode 100644 index c3ff2ac43b..0000000000 --- a/examples/fluids/src/smartsim/sgs_dd_training.c +++ /dev/null @@ -1,390 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../../qfunctions/sgs_dd_training.h" - -#include - -#include "../../include/smartsim.h" -#include "../../navierstokes.h" - -typedef struct { - CeedElemRestriction elem_restr_grid_aniso; - CeedVector grid_aniso_ceed; - CeedQFunctionContext sgs_dd_train_qfctx; -} *SGS_DD_TrainingSetupData; - -static PetscErrorCode SGS_DD_TrainingSetupDataDestroy(SGS_DD_TrainingSetupData sgs_dd_train_setup_data) { - Ceed ceed; - - PetscFunctionBeginUser; - PetscCall(CeedElemRestrictionGetCeed(sgs_dd_train_setup_data->elem_restr_grid_aniso, &ceed)); - - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_train_setup_data->elem_restr_grid_aniso)); - PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_train_setup_data->grid_aniso_ceed)); - PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_train_setup_data->sgs_dd_train_qfctx)); - PetscCall(PetscFree(sgs_dd_train_setup_data)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -// @brief Create DM for storing data-drive SGS model inputs -static PetscErrorCode SGS_DD_TrainingCreateDM(DM dm_source, DM *dm_dd_training, PetscInt degree, PetscInt q_extra, PetscInt *num_components) { - PetscSection section; - - PetscFunctionBeginUser; - *num_components = 12; - - PetscCall(DMClone(dm_source, dm_dd_training)); - PetscCall(PetscObjectSetName((PetscObject)*dm_dd_training, "Data-Driven SGS Training Data")); - - PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, num_components, *dm_dd_training)); - - PetscCall(DMGetLocalSection(*dm_dd_training, &section)); - PetscCall(PetscSectionSetFieldName(section, 0, "Data-Driven SGS Training Data")); - PetscCall(PetscSectionSetComponentName(section, 0, 0, "SGSInput1")); - PetscCall(PetscSectionSetComponentName(section, 0, 1, "SGSInput2")); - PetscCall(PetscSectionSetComponentName(section, 0, 2, "SGSInput3")); - PetscCall(PetscSectionSetComponentName(section, 0, 3, "SGSInput4")); - 
PetscCall(PetscSectionSetComponentName(section, 0, 4, "SGSInput5")); - PetscCall(PetscSectionSetComponentName(section, 0, 5, "SGSInput6")); - PetscCall(PetscSectionSetComponentName(section, 0, 6, "FilteredSGSXX")); - PetscCall(PetscSectionSetComponentName(section, 0, 7, "FilteredSGSYY")); - PetscCall(PetscSectionSetComponentName(section, 0, 8, "FilteredSGSZZ")); - PetscCall(PetscSectionSetComponentName(section, 0, 9, "FilteredSGSYZ")); - PetscCall(PetscSectionSetComponentName(section, 0, 10, "FilteredSGSXZ")); - PetscCall(PetscSectionSetComponentName(section, 0, 11, "FilteredSGSXY")); - PetscFunctionReturn(PETSC_SUCCESS); -}; - -// @brief Create CeedOperator to calculate training data for data-drive SGS model at nodes -static PetscErrorCode SetupTrainingDataCalculation(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, - SGS_DD_TrainingSetupData sgs_dd_train_setup_data) { - SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train; - CeedQFunction qf_sgs_dd_train; - CeedOperator op_sgs_dd_train; - CeedInt num_comp_grad_velo, num_comp_grid_aniso; - CeedVector inv_multiplicity, filtered_fields; - CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs_train; - DMLabel domain_label = NULL; - PetscInt label_value = 0, height = 0, dm_field = 0; - - PetscFunctionBeginUser; - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_train_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso)); - - PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_train->dm_dd_training, domain_label, label_value, height, dm_field, &elem_restr_sgs_train)); - PetscCall(GetInverseMultiplicity(ceed, sgs_dd_train->dm_dd_training, domain_label, label_value, height, dm_field, PETSC_TRUE, - &elem_restr_inv_multiplicity, &inv_multiplicity)); - - CeedElemRestriction elem_restr_filtered_state; - CeedInt num_comp_filtered_state; - { // -- Setup filtered velocity gradient projection - CeedBasis basis_filtered_state; - CeedOperatorField op_field; - 
PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->diff_filter->op_rhs_ctx->op, "v0", &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filtered_state)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_filtered_state, &num_comp_filtered_state)); - PetscCallCeed(ceed, CeedOperatorFieldGetBasis(op_field, &basis_filtered_state)); - PetscCall(VelocityGradientProjectionSetup(ceed, user, ceed_data, problem, STATEVAR_PRIMITIVE, elem_restr_filtered_state, basis_filtered_state, - &sgs_dd_train->filtered_grad_velo_proj)); - // Get velocity gradient information - PetscCallCeed(ceed, CeedOperatorGetFieldByName(sgs_dd_train->filtered_grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo)); - } - - CeedElemRestriction elem_restr_filtered_velo_prod; - CeedInt num_comp_filtered_velo_prod; - { // Get filtered velocity product information - CeedOperatorField op_field; - PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->diff_filter->op_rhs_ctx->op, "v1", &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filtered_velo_prod)); - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_filtered_velo_prod, &num_comp_filtered_velo_prod)); - } - - // -- Create operator for generating training data at nodes - // Differential Filter only provides filtered primitive variables - PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSGS_DDAnisotropicTrainingDataNodal_Prim, - ComputeSGS_DDAnisotropicTrainingDataNodal_Prim_loc, &qf_sgs_dd_train)); - - PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_train, sgs_dd_train_setup_data->sgs_dd_train_qfctx)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "q", num_comp_filtered_state, CEED_EVAL_NONE)); - 
PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "velocity product", num_comp_filtered_velo_prod, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "inverse multiplicity", 1, CEED_EVAL_NONE)); - PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_train, "training data", sgs_dd_train->num_comp_dd_inputs, CEED_EVAL_NONE)); - - PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_filtered_state, &filtered_fields, NULL)); - PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_train, NULL, NULL, &op_sgs_dd_train)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "q", elem_restr_filtered_state, CEED_BASIS_NONE, filtered_fields)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "velocity product", elem_restr_filtered_velo_prod, CEED_BASIS_NONE, filtered_fields)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "anisotropy tensor", sgs_dd_train_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE, - sgs_dd_train_setup_data->grid_aniso_ceed)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity)); - PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "training data", elem_restr_sgs_train, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - PetscCall(OperatorApplyContextCreate(sgs_dd_train->filtered_grad_velo_proj->dm, sgs_dd_train->dm_dd_training, ceed, op_sgs_dd_train, NULL, NULL, - NULL, NULL, &sgs_dd_train->op_training_data_calc_ctx)); - - PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity)); - PetscCallCeed(ceed, 
CeedVectorDestroy(&filtered_fields)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_train)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_train)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) { - SGS_DDTrainingContext sgsdd_train_qfctx; - SGS_DD_TrainingSetupData sgs_dd_train_setup_data; - - PetscFunctionBeginUser; - if (!user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem)); - if (!user->smartsim) PetscCall(SmartSimSetup(user)); - - PetscCall(PetscNew(&sgsdd_train_qfctx)); - PetscCall(PetscNew(&sgs_dd_train_setup_data)); - PetscCall(PetscNew(&user->sgs_dd_train)); - SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train; - - sgs_dd_train->overwrite_training_data = PETSC_TRUE; - sgs_dd_train->write_data_interval = 1; - sgs_dd_train->num_filter_widths = sizeof(sgs_dd_train->filter_widths) / sizeof(sgs_dd_train->filter_widths[0]); - PetscOptionsBegin(user->comm, NULL, "SGS Data-Driven Training Options", NULL); - PetscCall(PetscOptionsInt("-sgs_train_write_data_interval", "Number of timesteps between writing data into database", NULL, - sgs_dd_train->write_data_interval, &sgs_dd_train->write_data_interval, NULL)); - PetscCall(PetscOptionsBool("-sgs_train_overwrite_data", "Overwrite old training data in the database", NULL, sgs_dd_train->overwrite_training_data, - &sgs_dd_train->overwrite_training_data, NULL)); - PetscCall(PetscOptionsRealArray("-sgs_train_filter_width_scales", "Scales of each filter width put into training database", NULL, - sgs_dd_train->filter_widths, &sgs_dd_train->num_filter_widths, NULL)); - PetscOptionsEnd(); - - // -- Create DM for storing training data - PetscCall(SGS_DD_TrainingCreateDM(user->dm, &sgs_dd_train->dm_dd_training, user->app_ctx->degree, user->app_ctx->q_extra, - &sgs_dd_train->num_comp_dd_inputs)); - - { // 
-- Create QFunction Context - NewtonianIdealGasContext gas; - PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas)); - sgsdd_train_qfctx->gas = *gas; - PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas)); - PetscCallCeed(ceed, CeedQFunctionContextCreate(user->ceed, &sgs_dd_train_setup_data->sgs_dd_train_qfctx)); - PetscCallCeed(ceed, CeedQFunctionContextSetData(sgs_dd_train_setup_data->sgs_dd_train_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(*sgsdd_train_qfctx), sgsdd_train_qfctx)); - PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(sgs_dd_train_setup_data->sgs_dd_train_qfctx, CEED_MEM_HOST, FreeContextPetsc)); - } - - { // -- Send training data array info to SmartRedis database - PetscMPIInt rank, num_ranks; - SmartSimData smartsim = user->smartsim; - PetscCallMPI(MPI_Comm_rank(user->comm, &rank)); - PetscCallMPI(MPI_Comm_size(user->comm, &num_ranks)); - - { - PetscSection global_section; - PetscInt num_dofs, num_comps, local_min_max[2] = {0.}, global_min_max[2] = {0.}; - - PetscCall(DMGetGlobalSection(sgs_dd_train->dm_dd_training, &global_section)); - PetscCall(DMGetGlobalVectorInfo(sgs_dd_train->dm_dd_training, &num_dofs, NULL, NULL)); - PetscCall(PetscSectionGetFieldComponents(global_section, 0, &num_comps)); - local_min_max[0] = num_dofs; - PetscCall(PetscGlobalMinMaxInt(user->comm, local_min_max, global_min_max)); - - sgs_dd_train->training_data_array_dims[0] = global_min_max[0] / num_comps; - sgs_dd_train->training_data_array_dims[1] = num_comps; - } - - if (rank % smartsim->collocated_database_num_ranks == 0) { - { // Communicate info on simulation size - const char tensor_name[] = "sizeInfo"; - size_t array_info_dim = 6; - PetscInt64 array_info[6] = {0}, num_features = 6; - - array_info[0] = sgs_dd_train->training_data_array_dims[0]; - array_info[1] = sgs_dd_train->training_data_array_dims[1]; - array_info[2] = 
num_features; - array_info[3] = num_ranks; - array_info[4] = smartsim->collocated_database_num_ranks; - array_info[5] = rank; - - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis( - put_tensor(smartsim->client, tensor_name, strlen(tensor_name), array_info, &array_info_dim, 1, SRTensorTypeInt64, SRMemLayoutContiguous)); - PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name))); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - } - - { // Send array that communicates if tensors are overwritten in database - const char tensor_name[] = "tensor-ow"; - PetscInt64 tensor_overwrite[2] = {sgs_dd_train->overwrite_training_data}; - size_t dim_2[1] = {2}; - - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis( - put_tensor(smartsim->client, tensor_name, strlen(tensor_name), tensor_overwrite, dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous)); - PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name))); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - } - - { // Communicate number of filter widths used - const char tensor_name[] = "num_filter_widths"; - PetscInt64 num_filter_widths = sgs_dd_train->num_filter_widths; - size_t dim_2 = 1; - - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis( - put_tensor(smartsim->client, tensor_name, strlen(tensor_name), &num_filter_widths, &dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous)); - PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name))); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - } - } - } - - // -- Compute and store anisotropy tensor - PetscCall(GridAnisotropyTensorProjectionSetupApply(ceed, user, ceed_data, &sgs_dd_train_setup_data->elem_restr_grid_aniso, - &sgs_dd_train_setup_data->grid_aniso_ceed)); - - // -- Create Nodal Evaluation Operator - 
PetscCall(SetupTrainingDataCalculation(ceed, user, ceed_data, problem, sgs_dd_train_setup_data)); - - PetscCall(SGS_DD_TrainingSetupDataDestroy(sgs_dd_train_setup_data)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) { - User user = (User)ctx; - Ceed ceed = user->ceed; - SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train; - SmartSimData smartsim = user->smartsim; - Vec TrainingData; - PetscMPIInt rank; - - PetscFunctionBeginUser; - - PetscCallMPI(MPI_Comm_rank(user->comm, &rank)); - - if (step_num % sgs_dd_train->write_data_interval != 0) PetscFunctionReturn(PETSC_SUCCESS); - PetscCall(DMGetGlobalVector(sgs_dd_train->dm_dd_training, &TrainingData)); - - for (PetscInt filter_index = 0; filter_index < sgs_dd_train->num_filter_widths; filter_index++) { - PetscCall(PetscLogEventBegin(FLUIDS_TrainDataCompute, 0, 0, 0, 0)); - { // -- Compute and assemble training data - Vec FilteredVelocityGradient, FilteredFields, FilteredFields_loc; - PetscMemType filtered_fields_mem_type; - CeedVector filtered_fields; - - { // Set filter width for the current solve - double filter_width_scaling[3]; - CeedOperator op_mat; - Mat A_mat; - - for (int j = 0; j < 3; j++) filter_width_scaling[j] = sgs_dd_train->filter_widths[filter_index]; - PetscCall(KSPGetOperators(user->diff_filter->ksp, &A_mat, NULL)); - PetscCall(MatCeedGetCeedOperators(A_mat, &op_mat, NULL)); - PetscCall(CeedOperatorSetContextDouble(op_mat, user->diff_filter->filter_width_scaling_label, filter_width_scaling)); - } - - PetscCall(DMGetGlobalVector(user->diff_filter->dm_filter, &FilteredFields)); - PetscCall(DMGetLocalVector(user->diff_filter->dm_filter, &FilteredFields_loc)); - - PetscCall(DifferentialFilterApply(user, solution_time, Q, FilteredFields)); - PetscCall(DMGlobalToLocal(user->diff_filter->dm_filter, FilteredFields, INSERT_VALUES, FilteredFields_loc)); - - 
PetscCall(DMGetGlobalVector(sgs_dd_train->filtered_grad_velo_proj->dm, &FilteredVelocityGradient)); - PetscCall(VelocityGradientProjectionApply(sgs_dd_train->filtered_grad_velo_proj, FilteredFields_loc, FilteredVelocityGradient)); - - { - CeedOperatorField op_field; - - PetscCallCeed(ceed, CeedOperatorGetFieldByName(sgs_dd_train->op_training_data_calc_ctx->op, "q", &op_field)); - PetscCallCeed(ceed, CeedOperatorFieldGetVector(op_field, &filtered_fields)); - } - - PetscCall(VecPetscToCeed(FilteredFields_loc, &filtered_fields_mem_type, filtered_fields)); // filtered_fields is an implicit input - PetscCall(ApplyCeedOperatorGlobalToGlobal(FilteredVelocityGradient, TrainingData, sgs_dd_train->op_training_data_calc_ctx)); - PetscCall(VecCeedToPetsc(filtered_fields, filtered_fields_mem_type, FilteredFields_loc)); - - PetscCall(DMRestoreGlobalVector(sgs_dd_train->filtered_grad_velo_proj->dm, &FilteredVelocityGradient)); - PetscCall(DMRestoreGlobalVector(user->diff_filter->dm_filter, &FilteredFields)); - PetscCall(DMRestoreLocalVector(user->diff_filter->dm_filter, &FilteredFields_loc)); - } - PetscCall(PetscLogEventEnd(FLUIDS_TrainDataCompute, 0, 0, 0, 0)); - - { // -- Send training data to SmartSim - char array_key[PETSC_MAX_PATH_LEN]; - size_t array_key_len; - - if (sgs_dd_train->overwrite_training_data) { - PetscCall(PetscSNPrintf(array_key, sizeof array_key, "%s.%" PetscInt_FMT, smartsim->rank_id_name, filter_index)); - } else { - PetscCall(PetscSNPrintf(array_key, sizeof array_key, "%s.%" PetscInt_FMT "%" PetscInt_FMT, smartsim->rank_id_name, step_num, filter_index)); - } - PetscCall(PetscStrlen(array_key, &array_key_len)); - - { - const PetscScalar *training_data; - PetscCall(VecGetArrayRead(TrainingData, &training_data)); - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Train, 0, 0, 0, 0)); - PetscCallSmartRedis(put_tensor(smartsim->client, array_key, array_key_len, (void *)training_data, sgs_dd_train->training_data_array_dims, 2, - SRTensorTypeDouble, 
SRMemLayoutContiguous)); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Train, 0, 0, 0, 0)); - PetscCall(VecRestoreArrayRead(TrainingData, &training_data)); - } - } - } - - if (rank % smartsim->collocated_database_num_ranks == 0) { - const char tensor_name[] = "step"; - size_t dim_2[1] = {2}; - PetscInt64 step_array[2] = {step_num, step_num}; - - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis( - put_tensor(smartsim->client, tensor_name, strlen(tensor_name), step_array, dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous)); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - } - - PetscCall(DMRestoreGlobalVector(user->sgs_dd_train->dm_dd_training, &TrainingData)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) { - User user; - const char check_run_key[] = "check-run"; - PetscReal check_run[2] = {1}; - const size_t check_run_dims[1] = {2}; - size_t check_run_key_size; - - PetscFunctionBeginUser; - PetscCall(PetscStrlen(check_run_key, &check_run_key_size)); - PetscCall(TSGetApplicationContext(ts, &user)); - SmartSimData smartsim = user->smartsim; - - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis( - unpack_tensor(smartsim->client, check_run_key, check_run_key_size, check_run, check_run_dims, 1, SRTensorTypeDouble, SRMemLayoutContiguous)); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - if (check_run[0] == 0) { - PetscCall(PetscPrintf(user->comm, "-- Simulation stopped by 'check-run' tensor in Redis database\n")); - PetscCall(TSSetConvergedReason(ts, TS_CONVERGED_USER)); - } - - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) { - PetscFunctionBeginUser; - if (!sgs_dd_train) PetscFunctionReturn(PETSC_SUCCESS); - - PetscCall(OperatorApplyContextDestroy(sgs_dd_train->op_training_data_calc_ctx)); - 
PetscCall(NodalProjectionDataDestroy(sgs_dd_train->filtered_grad_velo_proj)); - PetscCall(DMDestroy(&sgs_dd_train->dm_dd_training)); - PetscCall(PetscFree(sgs_dd_train)); - - PetscFunctionReturn(PETSC_SUCCESS); -} diff --git a/examples/fluids/src/smartsim/smartsim.c b/examples/fluids/src/smartsim/smartsim.c deleted file mode 100644 index 03ddab9606..0000000000 --- a/examples/fluids/src/smartsim/smartsim.c +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed -// Based on the instructions from https://www.craylabs.org/docs/sr_integration.html and PHASTA implementation - -#include "../../include/smartsim.h" - -#include "../../navierstokes.h" - -PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length) { - bool does_exist = true; - - PetscFunctionBeginUser; - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis(tensor_exists(c_client, name, name_length, &does_exist)); - PetscCheck(does_exist, PETSC_COMM_SELF, -1, "Tensor of name '%s' was not written to the database successfully", name); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SmartSimTrainingSetup(User user) { - SmartSimData smartsim = user->smartsim; - PetscMPIInt rank; - PetscReal checkrun[2] = {1}; - size_t dim_2[1] = {2}; - - PetscFunctionBeginUser; - PetscCallMPI(MPI_Comm_rank(user->comm, &rank)); - - if (rank % smartsim->collocated_database_num_ranks == 0) { - // -- Send array that communicates when ML is done training - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - PetscCallSmartRedis(put_tensor(smartsim->client, "check-run", 9, checkrun, dim_2, 1, SRTensorTypeDouble, 
SRMemLayoutContiguous)); - PetscCall(SmartRedisVerifyPutTensor(smartsim->client, "check-run", 9)); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0)); - } - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SmartSimSetup(User user) { - PetscMPIInt rank; - PetscInt num_orchestrator_nodes = 1; - - PetscFunctionBeginUser; - PetscCall(PetscNew(&user->smartsim)); - SmartSimData smartsim = user->smartsim; - - smartsim->collocated_database_num_ranks = 1; - PetscOptionsBegin(user->comm, NULL, "Options for SmartSim integration", NULL); - PetscCall(PetscOptionsInt("-smartsim_collocated_database_num_ranks", "Number of ranks per collocated database instance", NULL, - smartsim->collocated_database_num_ranks, &smartsim->collocated_database_num_ranks, NULL)); - PetscOptionsEnd(); - - // Create prefix to be put on tensor names - PetscCallMPI(MPI_Comm_rank(user->comm, &rank)); - PetscCall(PetscSNPrintf(smartsim->rank_id_name, sizeof(smartsim->rank_id_name), "y.%d", rank)); - - PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Init, 0, 0, 0, 0)); - PetscCallSmartRedis(SmartRedisCClient(num_orchestrator_nodes != 1, smartsim->rank_id_name, strlen(smartsim->rank_id_name), &smartsim->client)); - PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Init, 0, 0, 0, 0)); - - PetscCall(SmartSimTrainingSetup(user)); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) { - PetscFunctionBeginUser; - if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS); - - PetscCallSmartRedis(DeleteCClient(&smartsim->client)); - PetscCall(PetscFree(smartsim)); - PetscFunctionReturn(PETSC_SUCCESS); -} diff --git a/examples/fluids/src/smartsim_weak.c b/examples/fluids/src/smartsim_weak.c deleted file mode 100644 index 9c97419a8c..0000000000 --- a/examples/fluids/src/smartsim_weak.c +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed -// -// @file This creates weak functions for smartsim dependent functions. If the smartsim-dependent functions are actually built, these functions are not -// linked to the final executable. - -#include "../navierstokes.h" - -PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) __attribute__((weak)); -PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) { - PetscFunctionBeginUser; - SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__); -}; - -PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) __attribute__((weak)); -PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) { - PetscFunctionBeginUser; - SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__); -}; - -PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) __attribute__((weak)); -PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) { - PetscFunctionBeginUser; - if (!sgs_dd_train) PetscFunctionReturn(PETSC_SUCCESS); - PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Warning: SGS_DD_TrainingData struct should not be initialized if SMARTREDIS_DIR isn't set on build...")); - PetscFunctionReturn(PETSC_SUCCESS); -} - -PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) __attribute__((weak)); -PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) { - PetscFunctionBeginUser; - SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__); -}; - -PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) __attribute__((weak)); -PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) { - 
PetscFunctionBeginUser; - if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS); - PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Warning: SmartSimData struct should not be initialized if SMARTREDIS_DIR isn't set on build...")); - - PetscFunctionReturn(PETSC_SUCCESS); -} diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c index 9bcc753885..76bee17d39 100644 --- a/examples/fluids/src/strong_boundary_conditions.c +++ b/examples/fluids/src/strong_boundary_conditions.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -26,8 +26,21 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem PetscFunctionBeginUser; PetscCall(DMGetLabel(dm, "Face Sets", &domain_label)); - // Basis - PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x_sur, ceed_data->basis_q_sur, &basis_x_to_q_sur)); + { // Basis + CeedBasis basis_x_sur, basis_q_sur; + DM dm_coord; + + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + DMLabel label = NULL; + PetscInt label_value = 0; + PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, dm_field, &basis_q_sur)); + PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, dm_field, &basis_x_sur)); + + PetscCallCeed(ceed, CeedBasisCreateProjection(basis_x_sur, basis_q_sur, &basis_x_to_q_sur)); + + PetscCallCeed(ceed, CeedBasisDestroy(&basis_q_sur)); + PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_sur)); + } // Setup QFunction PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupStrongBC, SetupStrongBC_loc, &qf_setup)); @@ -40,6 +53,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem // Setup STG Setup QFunction 
PetscCall(SetupStrongStg_PreProcessing(ceed, problem, num_comp_x, stg_data_size, dXdx_size, &qf_stgdata)); + PetscCall(SetupStrongStg_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, dXdx_size, &qf_strongbc)); // Compute contribution on each boundary face for (CeedInt i = 0; i < bc->num_inflow; i++) { @@ -79,8 +93,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem PetscCallCeed(ceed, CeedOperatorApply(op_stgdata, CEED_VECTOR_NONE, stg_data, CEED_REQUEST_IMMEDIATE)); - // -- Setup BC QFunctions - PetscCall(SetupStrongStg_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, dXdx_size, &qf_strongbc)); + // -- Setup BC Sub Operator PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_strongbc, NULL, NULL, &op_strong_bc_sub)); PetscCallCeed(ceed, CeedOperatorSetName(op_strong_bc_sub, "Strong STG")); @@ -91,7 +104,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem PetscCallCeed(ceed, CeedOperatorSetField(op_strong_bc_sub, "q", elem_restr_q_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); // -- Add to composite operator - PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_strong_bc, op_strong_bc_sub)); + PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_strong_bc, op_strong_bc_sub)); PetscCallCeed(ceed, CeedVectorDestroy(&multiplicity)); PetscCallCeed(ceed, CeedVectorDestroy(&x_stored)); @@ -104,8 +117,6 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_scale)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_stgdata)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dXdx)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_strongbc)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stgdata)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_strong_bc_sub)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_stgdata)); @@ -114,6 +125,8 @@ 
PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_strong_bc, "solution time", &phys->stg_solution_time_label)); PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_to_q_sur)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_strongbc)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stgdata)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -155,7 +168,7 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User use PetscCall(DMRestoreGlobalVector(dm, &global_vec)); } - PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_strong_bc)); + PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_strong_bc)); { PetscBool use_strongstg = PETSC_FALSE; PetscCall(PetscOptionsGetBool(NULL, NULL, "-stg_strong", &use_strongstg, NULL)); @@ -168,5 +181,6 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User use PetscCall(OperatorApplyContextCreate(NULL, NULL, ceed, op_strong_bc, CEED_VECTOR_NONE, NULL, NULL, NULL, &user->op_strong_bc_ctx)); PetscCall(PetscObjectComposeFunction((PetscObject)dm, "DMPlexInsertBoundaryValues_C", DMPlexInsertBoundaryValues_StrongBCCeed)); + PetscCallCeed(ceed, CeedOperatorDestroy(&op_strong_bc)); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c index 54ab617afc..942efc38a7 100644 --- a/examples/fluids/src/turb_spanstats.c +++ b/examples/fluids/src/turb_spanstats.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -21,9 +21,9 @@ #include "../navierstokes.h" typedef struct { - CeedElemRestriction elem_restr_parent_x, elem_restr_parent_stats, elem_restr_parent_qd, elem_restr_parent_colloc, elem_restr_child_colloc; + CeedElemRestriction elem_restr_parent_x, elem_restr_parent_stats, elem_restr_parent_colloc, elem_restr_child_colloc; CeedBasis basis_x, basis_stats; - CeedVector x_coord, q_data; + CeedVector x_coord; } *SpanStatsSetupData; PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) { @@ -40,7 +40,7 @@ PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) { // Get spanwise length PetscCall(DMGetBoundingBox(user->dm, domain_min, domain_max)); - user->spanstats.span_width = domain_max[2] - domain_min[1]; + user->spanstats.span_width = domain_max[2] - domain_min[2]; { // Get DM from surface DM parent_distributed_dm; @@ -65,6 +65,7 @@ PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) { for (PetscInt i = 0; i < nleaves; i++) { PetscCall(DMLabelSetValue(label, ilocal[i], 1)); } + PetscCall(PetscSFDestroy(&inv_isoperiodicface)); } else { PetscCall(DMGetLabel(user->dm, "Face Sets", &label)); } @@ -169,7 +170,7 @@ PetscErrorCode GetQuadratureCoords(Ceed ceed, DM dm, CeedElemRestriction elem_re PetscCallCeed(ceed, CeedOperatorSetField(op_quad_coords, "input", elem_restr_x, basis_x, x_coords)); PetscCallCeed(ceed, CeedOperatorSetField(op_quad_coords, "output", elem_restr_qx, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCall(CeedOperatorCreateLocalVecs(op_quad_coords, DMReturnVecType(dm), PetscObjectComm((PetscObject)dm), NULL, Qx_coords)); + PetscCall(CeedOperatorCreateLocalVecs(op_quad_coords, DMReturnVecType(dm), PETSC_COMM_SELF, NULL, Qx_coords)); PetscCall(OperatorApplyContextCreate(NULL, NULL, ceed, op_quad_coords, CEED_VECTOR_NONE, NULL, NULL, NULL, &op_quad_coords_ctx)); PetscCall(ApplyCeedOperatorLocalToLocal(NULL, *Qx_coords, op_quad_coords_ctx)); @@ 
-183,7 +184,6 @@ PetscErrorCode GetQuadratureCoords(Ceed ceed, DM dm, CeedElemRestriction elem_re PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, SpanStatsSetupData *stats_data) { DM dm = user->spanstats.dm; - PetscInt dim; CeedInt num_comp_x, num_comp_stats = user->spanstats.num_comp_stats; Vec X_loc; DMLabel domain_label = NULL; @@ -192,14 +192,10 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data PetscFunctionBeginUser; PetscCall(PetscNew(stats_data)); - PetscCall(DMGetDimension(dm, &dim)); PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &(*stats_data)->elem_restr_parent_stats)); PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &(*stats_data)->elem_restr_parent_x)); - PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, problem->q_data_size_sur, - &(*stats_data)->elem_restr_parent_qd)); PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents((*stats_data)->elem_restr_parent_x, &num_comp_x)); PetscCallCeed(ceed, CeedElemRestrictionCreateVector((*stats_data)->elem_restr_parent_x, &(*stats_data)->x_coord, NULL)); - PetscCallCeed(ceed, CeedElemRestrictionCreateVector((*stats_data)->elem_restr_parent_qd, &(*stats_data)->q_data, NULL)); { DM dm_coord; @@ -210,8 +206,8 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data PetscCall(CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, (*stats_data)->basis_stats, (*stats_data)->elem_restr_parent_stats, &(*stats_data)->elem_restr_parent_colloc)); - PetscCall( - CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, ceed_data->basis_q, ceed_data->elem_restr_q, &(*stats_data)->elem_restr_child_colloc)); + PetscCall(CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, ceed_data->basis_q, ceed_data->elem_restr_q, + &(*stats_data)->elem_restr_child_colloc)); { // -- Copy DM 
coordinates into CeedVector DM cdm; @@ -234,7 +230,6 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) { PetscCall(CeedElemRestrictionGetCeed(data->elem_restr_parent_x, &ceed)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_x)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_stats)); - PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_qd)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_colloc)); PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_child_colloc)); @@ -242,7 +237,8 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) { PetscCallCeed(ceed, CeedBasisDestroy(&data->basis_stats)); PetscCallCeed(ceed, CeedVectorDestroy(&data->x_coord)); - PetscCallCeed(ceed, CeedVectorDestroy(&data->q_data)); + + PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_WORLD, PETSC_ERR_LIB, "Destroying Ceed object failed"); PetscCall(PetscFree(data)); PetscFunctionReturn(PETSC_SUCCESS); @@ -298,10 +294,13 @@ PetscErrorCode CreateStatsSF(Ceed ceed, CeedData ceed_data, SpanStatsSetupData s // @brief Setup RHS and LHS for L^2 projection of statistics PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data, SpanStatsSetupData stats_data) { - CeedOperator op_mass, op_setup_sur, op_proj_rhs; - CeedQFunction qf_mass, qf_stats_proj; - CeedInt q_data_size, num_comp_stats = user->spanstats.num_comp_stats; - MPI_Comm comm = PetscObjectComm((PetscObject)user->spanstats.dm); + CeedOperator op_mass, op_proj_rhs; + CeedQFunction qf_mass, qf_stats_proj; + CeedInt q_data_size, num_comp_stats = user->spanstats.num_comp_stats; + CeedElemRestriction elem_restr_qd; + CeedVector q_data; + DMLabel domain_label = NULL; + PetscInt label_value = 0; PetscFunctionBeginUser; // -- Create Operator for RHS of L^2 projection of statistics @@ -314,33 +313,24 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData 
ceed_data, PetscCallCeed(ceed, CeedOperatorSetField(op_proj_rhs, "output", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE)); PetscCall(OperatorApplyContextCreate(NULL, user->spanstats.dm, ceed, op_proj_rhs, NULL, NULL, NULL, NULL, &user->spanstats.op_proj_rhs_ctx)); - PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), comm, &user->spanstats.Parent_Stats_loc, NULL)); - - // -- Setup LHS of L^2 projection - // Get q_data for mass matrix operator - PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur)); - PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", stats_data->elem_restr_parent_x, stats_data->basis_x, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, stats_data->basis_x, CEED_VECTOR_NONE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, stats_data->x_coord, stats_data->q_data, CEED_REQUEST_IMMEDIATE)); - - // CEED Restriction - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(stats_data->elem_restr_parent_qd, &q_data_size)); + PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, &user->spanstats.Parent_Stats_loc, NULL)); + PetscCall(QDataGet(ceed, user->spanstats.dm, domain_label, label_value, stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord, + &elem_restr_qd, &q_data, &q_data_size)); // Create Mass CeedOperator PetscCall(CreateMassQFunction(ceed, num_comp_stats, q_data_size, &qf_mass)); PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_mass, NULL, NULL, &op_mass)); PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "u", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, 
CeedOperatorSetField(op_mass, "qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, stats_data->q_data)); + PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "qdata", elem_restr_qd, CEED_BASIS_NONE, q_data)); PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "v", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE)); { // Setup KSP for L^2 projection Mat mat_mass; KSP ksp; - PetscCall(MatCeedCreate(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass)); + PetscCall(MatCreateCeed(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass)); - PetscCall(KSPCreate(comm, &ksp)); + PetscCall(KSPCreate(PetscObjectComm((PetscObject)user->spanstats.dm), &ksp)); PetscCall(KSPSetOptionsPrefix(ksp, "turbulence_spanstats_")); { PC pc; @@ -357,10 +347,11 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data, } // Cleanup + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd)); + PetscCallCeed(ceed, CeedVectorDestroy(&q_data)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass)); PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stats_proj)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_mass)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur)); PetscCallCeed(ceed, CeedOperatorDestroy(&op_proj_rhs)); PetscFunctionReturn(PETSC_SUCCESS); } @@ -385,8 +376,9 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData case STATEVAR_CONSERVATIVE: PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollection_Conserv, ChildStatsCollection_Conserv_loc, &qf_stats_collect)); break; - default: - SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "No statisics collection available for chosen state variable"); + case STATEVAR_ENTROPY: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollection_Entropy, ChildStatsCollection_Entropy_loc, &qf_stats_collect)); + break; } if (user->spanstats.do_mms_test) { @@ -405,9 +397,9 @@ 
PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(collect_context, "solution time", offsetof(struct Turbulence_SpanStatsContext_, solution_time), 1, "Current solution time")); - PetscCallCeed( - ceed, CeedQFunctionContextRegisterDouble(collect_context, "previous time", offsetof(struct Turbulence_SpanStatsContext_, previous_time), 1, - "Previous time statistics collection was done")); + PetscCallCeed(ceed, + CeedQFunctionContextRegisterDouble(collect_context, "previous time", offsetof(struct Turbulence_SpanStatsContext_, previous_time), + 1, "Previous time statistics collection was done")); PetscCallCeed(ceed, CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx)); } @@ -431,7 +423,7 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData PetscCall(OperatorApplyContextCreate(user->dm, user->spanstats.dm, user->ceed, op_stats_collect, user->q_ceed, NULL, NULL, NULL, &user->spanstats.op_stats_collect_ctx)); - PetscCall(CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PetscObjectComm((PetscObject)user->spanstats.dm), NULL, + PetscCall(CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, NULL, &user->spanstats.Child_Stats_loc)); PetscCall(VecZeroEntries(user->spanstats.Child_Stats_loc)); @@ -442,13 +434,18 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData // Creates operator for calculating error of method of manufactured solution (MMS) test PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, SpanStatsSetupData stats_data) { - CeedInt num_comp_stats = user->spanstats.num_comp_stats, num_comp_x, q_data_size; - CeedQFunction qf_error; - CeedOperator op_error; - CeedVector x_ceed, y_ceed; + CeedInt num_comp_stats = user->spanstats.num_comp_stats, num_comp_x, q_data_size; + CeedQFunction 
qf_error; + CeedOperator op_error; + CeedVector x_ceed, y_ceed; + DMLabel domain_label = NULL; + PetscInt label_value = 0; + CeedVector q_data; + CeedElemRestriction elem_restr_parent_qd; PetscFunctionBeginUser; - PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(stats_data->elem_restr_parent_qd, &q_data_size)); + PetscCall(QDataGet(ceed, user->spanstats.dm, domain_label, label_value, stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord, + &elem_restr_parent_qd, &q_data, &q_data_size)); PetscCallCeed(ceed, CeedBasisGetNumComponents(stats_data->basis_x, &num_comp_x)); PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollectionMMSTest_Error, ChildStatsCollectionMMSTest_Error_loc, &qf_error)); @@ -459,7 +456,7 @@ PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, S PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_error, NULL, NULL, &op_error)); PetscCallCeed(ceed, CeedOperatorSetField(op_error, "q", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE)); - PetscCallCeed(ceed, CeedOperatorSetField(op_error, "qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, stats_data->q_data)); + PetscCallCeed(ceed, CeedOperatorSetField(op_error, "qdata", elem_restr_parent_qd, CEED_BASIS_NONE, q_data)); PetscCallCeed(ceed, CeedOperatorSetField(op_error, "x", stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord)); PetscCallCeed(ceed, CeedOperatorSetField(op_error, "v", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE)); @@ -468,10 +465,12 @@ PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, S PetscCall(OperatorApplyContextCreate(user->spanstats.dm, user->spanstats.dm, user->ceed, op_error, x_ceed, y_ceed, NULL, NULL, &user->spanstats.mms_error_ctx)); - PetscCallCeed(ceed, CeedOperatorDestroy(&op_error)); - PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_error)); + PetscCallCeed(ceed, 
CeedVectorDestroy(&q_data)); PetscCallCeed(ceed, CeedVectorDestroy(&x_ceed)); PetscCallCeed(ceed, CeedVectorDestroy(&y_ceed)); + PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_parent_qd)); + PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_error)); + PetscCallCeed(ceed, CeedOperatorDestroy(&op_error)); PetscFunctionReturn(PETSC_SUCCESS); } diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c index 7b1f970d72..277da68ee1 100644 --- a/examples/fluids/src/velocity_gradient_projection.c +++ b/examples/fluids/src/velocity_gradient_projection.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -22,8 +22,8 @@ PetscErrorCode VelocityGradientProjectionCreateDM(NodalProjectionData grad_velo_ PetscCall(DMClone(user->dm, &grad_velo_proj->dm)); PetscCall(PetscObjectSetName((PetscObject)grad_velo_proj->dm, "Velocity Gradient Projection")); - PetscCall( - DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, user->app_ctx->degree, 1, user->app_ctx->q_extra, 1, &grad_velo_proj->num_comp, grad_velo_proj->dm)); + PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, user->app_ctx->degree, 1, user->app_ctx->q_extra, 1, &grad_velo_proj->num_comp, + grad_velo_proj->dm)); PetscCall(DMGetLocalSection(grad_velo_proj->dm, &section)); PetscCall(PetscSectionSetFieldName(section, 0, "")); @@ -67,15 +67,17 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce // -- Build RHS operator switch (state_var_input) { case STATEVAR_PRIMITIVE: - PetscCallCeed( - ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Prim, VelocityGradientProjectionRHS_Prim_loc, &qf_rhs_assemble)); + PetscCallCeed(ceed,
CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Prim, VelocityGradientProjectionRHS_Prim_loc, + &qf_rhs_assemble)); break; case STATEVAR_CONSERVATIVE: PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Conserv, VelocityGradientProjectionRHS_Conserv_loc, &qf_rhs_assemble)); break; - default: - SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "No velocity gradient projection QFunction for chosen state variable"); + case STATEVAR_ENTROPY: + PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Entropy, VelocityGradientProjectionRHS_Entropy_loc, + &qf_rhs_assemble)); + break; } PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_assemble, problem->apply_vol_ifunction.qfunction_context)); @@ -105,7 +107,7 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce Mat mat_mass; MPI_Comm comm = PetscObjectComm((PetscObject)grad_velo_proj->dm); - PetscCall(MatCeedCreate(grad_velo_proj->dm, grad_velo_proj->dm, op_mass, NULL, &mat_mass)); + PetscCall(MatCreateCeed(grad_velo_proj->dm, grad_velo_proj->dm, op_mass, NULL, &mat_mass)); PetscCall(KSPCreate(comm, &grad_velo_proj->ksp)); PetscCall(KSPSetOptionsPrefix(grad_velo_proj->ksp, "velocity_gradient_projection_")); diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin new file mode 100644 index 0000000000..af70688040 Binary files /dev/null and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin differ diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin index facbebe2d6..9ae8647455 100644 Binary files a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin differ diff 
--git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin index bd8ea4f163..27826f39c2 100644 Binary files a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin differ diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin index ab4052b1dc..5b9252ae28 100644 Binary files a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin differ diff --git a/examples/fluids/vortexshedding.yaml b/examples/fluids/vortexshedding.yaml index 9541a94c7e..9b910da328 100644 --- a/examples/fluids/vortexshedding.yaml +++ b/examples/fluids/vortexshedding.yaml @@ -33,7 +33,7 @@ outflow: freestream: velocity: 1,0,0 # Small gravity vector to break symmetry so shedding can start -g: 0,-.01,0 +gravity: 0,-.01,0 # viscosity corresponds to Reynolds number 100 mu: 0.01 @@ -44,11 +44,11 @@ degree: 3 dm_plex_filename: examples/fluids/meshes/cylinder-q1-n08.msh # Boundary Settings -bc_slip_z: 6 +bc_symmetry_z: 6 bc_wall: 5 bc_freestream: 1 bc_outflow: 2 -bc_slip_y: 3,4 +bc_symmetry_y: 3,4 wall_comps: 1,2,3 # Primitive variables are preferred at low Mach number diff --git a/examples/mfem/Makefile b/examples/mfem/Makefile index cb5abeba01..6b042926fe 100644 --- a/examples/mfem/Makefile +++ b/examples/mfem/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/mfem/README.md b/examples/mfem/README.md new file mode 100644 index 0000000000..d6d2002177 --- /dev/null +++ b/examples/mfem/README.md @@ -0,0 +1,18 @@ +## libCEED MFEM Examples + +These examples show how to write libCEED operators (BP1 and BP3) within the open-source finite element library [MFEM](https://www.mfem.org/). + +First compile MFEM and libCEED individually. After that, compile the MFEM example: + +```bash +export MFEM_DIR=/path/to/mfem +make +``` + +To run the executable, write: + +``` +./bp[1, 3] +``` + +Optional command-line arguments are shown by adding the command-line argument "--help". diff --git a/examples/mfem/bp1.cpp b/examples/mfem/bp1.cpp index f6a96dd536..096a6aeee7 100644 --- a/examples/mfem/bp1.cpp +++ b/examples/mfem/bp1.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/mfem/bp1.h b/examples/mfem/bp1.h index 332340340f..3e6fe273c8 100644 --- a/examples/mfem/bp1.h +++ b/examples/mfem/bp1.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include /// A structure used to pass additional data to f_build_mass struct BuildContext { diff --git a/examples/mfem/bp1.hpp b/examples/mfem/bp1.hpp index cb43675b56..912346857c 100644 --- a/examples/mfem/bp1.hpp +++ b/examples/mfem/bp1.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/mfem/bp3.cpp b/examples/mfem/bp3.cpp index d4b8eb24e9..779a75f3a2 100644 --- a/examples/mfem/bp3.cpp +++ b/examples/mfem/bp3.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -188,6 +188,7 @@ int main(int argc, char *argv[]) { delete fespace; delete fec; delete mesh; + delete D; CeedDestroy(&ceed); return 0; } diff --git a/examples/mfem/bp3.h b/examples/mfem/bp3.h index dde37b7446..bc73b3acab 100644 --- a/examples/mfem/bp3.h +++ b/examples/mfem/bp3.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include /// A structure used to pass additional data to f_build_diff and f_apply_diff struct BuildContext { diff --git a/examples/mfem/bp3.hpp b/examples/mfem/bp3.hpp index d9b74474d0..36b88b3697 100644 --- a/examples/mfem/bp3.hpp +++ b/examples/mfem/bp3.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/nek/README.md b/examples/nek/README.md index 6c1cfdee44..1b8faec64d 100644 --- a/examples/nek/README.md +++ b/examples/nek/README.md @@ -2,7 +2,7 @@ ### Prerequisites -Nek5000 v18.0 or greater must be [installed](https://nek5000.mcs.anl.gov/getstarted/) to run these examples. +Nek5000 v19.0 or greater must be [installed](https://nek5000.mcs.anl.gov/getstarted/) to run these examples. It is assumed to exist at `../../../Nek5000` (a sibling to the libCEED directory) or at a path defined in the environment variable `NEK5K_DIR`. For example, you could set ```sh diff --git a/examples/nek/bps/bps.h b/examples/nek/bps/bps.h index 446377b044..5de48e9e34 100644 --- a/examples/nek/bps/bps.h +++ b/examples/nek/bps/bps.h @@ -1,15 +1,15 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed +#pragma once -#ifndef bps_h -#define bps_h - -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #ifndef M_PI #define M_PI 3.14159265358979323846 @@ -110,5 +110,3 @@ CEED_QFUNCTION(diffusionf)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce } // End of Quadrature Point Loop return 0; } - -#endif // bps_h diff --git a/examples/nek/bps/bps.usr b/examples/nek/bps/bps.usr index 89f7fca164..f5021d78d9 100644 --- a/examples/nek/bps/bps.usr +++ b/examples/nek/bps/bps.usr @@ -1,4 +1,4 @@ -C Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +C Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors C All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details. C C SPDX-License-Identifier: (BSD-2-Clause) diff --git a/examples/petsc/Makefile b/examples/petsc/Makefile index b465d25cfe..d66fd3176a 100644 --- a/examples/petsc/Makefile +++ b/examples/petsc/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause @@ -15,7 +15,7 @@ CEED_DIR ?= ../.. 
ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc)) -CFLAGS = -std=c99 \ +CFLAGS = -std=c11 \ $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \ $(call pkgconf, --cflags-only-other $(PETSc.pc)) \ $(OPT) diff --git a/examples/petsc/README.md b/examples/petsc/README.md index 4ec9e4baff..b63e7d0e98 100644 --- a/examples/petsc/README.md +++ b/examples/petsc/README.md @@ -1,6 +1,6 @@ ## libCEED + PETSc Examples -PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required. +This page provides a description of the CEED bakeoff problem examples for the libCEED library, based on PETSc. ### CEED bakeoff problems with raw mesh management - bpsraw @@ -17,7 +17,6 @@ In addition to the common arguments, the following arguments may be set: ### CEED bakeoff problems with DMPlex - bps This code solves the CEED bakeoff problems on a unstructured grid using DMPlex. -This example requires a PETSc version later than 3.11.3. To build, run `make bps` @@ -43,7 +42,6 @@ The resulting log file can be read by the Python plotting scripts in `benchmarks ### CEED bakeoff problems with DMPlex and PCMG - multigrid This code solves the CEED bakeoff problems on a unstructured grid using DMPlex with p-multigrid implemented in PCMG. -This example requires a PETSc version later than 3.11.3. To build, run `make multigrid` diff --git a/examples/petsc/area.c b/examples/petsc/area.c index c72de7d6fa..1b146a4c21 100644 --- a/examples/petsc/area.c +++ b/examples/petsc/area.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -71,7 +71,7 @@ int main(int argc, char **argv) { Ceed ceed; CeedData ceed_data; ProblemType problem_choice; - VecType vec_type; + VecType vec_type = VECSTANDARD; PetscMemType mem_type; PetscCall(PetscInitialize(&argc, &argv, NULL, help)); @@ -110,15 +110,6 @@ int main(int argc, char **argv) { // Create DM PetscCall(SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false)); - // Create vectors - PetscCall(DMCreateGlobalVector(dm, &U)); - PetscCall(VecGetLocalSize(U, &l_size)); - PetscCall(VecGetSize(U, &g_size)); - PetscCall(DMCreateLocalVector(dm, &U_loc)); - PetscCall(VecGetSize(U_loc, &xl_size)); - PetscCall(VecDuplicate(U, &V)); - PetscCall(VecDuplicate(U_loc, &V_loc)); - // Setup op_apply_ctx structure PetscCall(PetscMalloc1(1, &op_apply_ctx)); @@ -127,23 +118,30 @@ int main(int argc, char **argv) { CeedMemType mem_type_backend; CeedGetPreferredMemType(ceed, &mem_type_backend); - PetscCall(DMGetVecType(dm, &vec_type)); - if (!vec_type) { // Not yet set by op_apply_ctx -dm_vec_type - switch (mem_type_backend) { - case CEED_MEM_HOST: - vec_type = VECSTANDARD; - break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } + // Set mesh vec_type + switch (mem_type_backend) { + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; } - PetscCall(DMSetVecType(dm, vec_type)); } + PetscCall(DMSetVecType(dm, vec_type)); + + // Create vectors + PetscCall(DMCreateGlobalVector(dm, &U)); + 
PetscCall(VecGetLocalSize(U, &l_size)); + PetscCall(VecGetSize(U, &g_size)); + PetscCall(DMCreateLocalVector(dm, &U_loc)); + PetscCall(VecGetSize(U_loc, &xl_size)); + PetscCall(VecDuplicate(U, &V)); + PetscCall(VecDuplicate(U_loc, &V_loc)); // Print summary if (!test_mode) { @@ -168,7 +166,7 @@ int main(int argc, char **argv) { // Setup libCEED's objects and apply setup operator PetscCall(PetscMalloc1(1, &ceed_data)); PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, problem_options[problem_choice], - ceed_data, false, (CeedVector)NULL, (CeedVector *)NULL)); + ceed_data, false, true, (CeedVector)NULL, (CeedVector *)NULL)); // Setup output vector PetscCall(VecZeroEntries(V_loc)); diff --git a/examples/petsc/area.h b/examples/petsc/area.h index 1b95f6d6df..fd36dd79df 100644 --- a/examples/petsc/area.h +++ b/examples/petsc/area.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c index 29101e1379..a00ee650c8 100644 --- a/examples/petsc/bps.c +++ b/examples/petsc/bps.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -25,7 +25,9 @@ // ./bps -problem bp6 -degree 3 -ceed /gpu/cuda // //TESTARGS(name="BP3, tet elements") -ceed {ceed_resource} -test -problem bp3 -degree 3 -ksp_max_it_clip 50,50 -simplex -//TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 15,15 +//TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 18,18 +//TESTARGS(name="BP1+3, hex elements") -ceed {ceed_resource} -test -problem bp1_3 -degree 3 -ksp_max_it_clip 18,18 +//TESTARGS(name="BP2+4, hex elements") -ceed {ceed_resource} -test -problem bp2_4 -degree 3 -ksp_max_it_clip 18,18 /// @file /// CEED BPs example using PETSc with DMPlex @@ -62,7 +64,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) CeedQFunction qf_error; CeedOperator op_error; CeedVector rhs_ceed, target; - VecType vec_type; + VecType vec_type = VECSTANDARD; PetscMemType mem_type; PetscFunctionBeginUser; @@ -71,23 +73,22 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) CeedMemType mem_type_backend; CeedGetPreferredMemType(ceed, &mem_type_backend); - PetscCall(DMGetVecType(dm, &vec_type)); - if (!vec_type) { // Not yet set by user -dm_vec_type - switch (mem_type_backend) { - case CEED_MEM_HOST: - vec_type = VECSTANDARD; - break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } + // Set mesh vec_type + switch (mem_type_backend) { + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else 
if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; } - PetscCall(DMSetVecType(dm, vec_type)); } + PetscCall(DMSetVecType(dm, vec_type)); + PetscCall(DMSetFromOptions(dm)); // Create global and local solution vectors PetscCall(DMCreateGlobalVector(dm, &X)); @@ -112,6 +113,15 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) const char *used_resource; CeedGetResource(ceed, &used_resource); + bool is_combined_bp = rp->bp_choice > CEED_BP6; + char bp_name[6] = ""; + + if (is_combined_bp) { + PetscCall(PetscSNPrintf(bp_name, 6, "%d + %d", rp->bp_choice % 2 ? 2 : 1, rp->bp_choice - CEED_BP4)); + } else { + PetscCall(PetscSNPrintf(bp_name, 6, "%d", rp->bp_choice + 1)); + } + VecType vec_type; PetscCall(VecGetType(X, &vec_type)); @@ -123,7 +133,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) PetscMPIInt comm_size; PetscCall(MPI_Comm_size(rp->comm, &comm_size)); PetscCall(PetscPrintf(rp->comm, - "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc --\n" + "\n-- CEED Benchmark Problem %s -- libCEED + PETSc --\n" " MPI:\n" " Hostname : %s\n" " Total ranks : %d\n" @@ -142,8 +152,8 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) " Element topology : %s\n" " Owned nodes : %" PetscInt_FMT "\n" " DoF per node : %" PetscInt_FMT "\n", - rp->bp_choice + 1, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P, - Q, rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u, + bp_name, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P, Q, + rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u, rp->num_comp_u)); } @@ -155,7 +165,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) 
PetscCall(PetscMalloc1(1, &ceed_data)); PetscCall(SetupLibceedByDegree(dm, ceed, rp->degree, rp->dim, rp->q_extra, rp->dim, rp->num_comp_u, g_size, xl_size, bp_options[rp->bp_choice], - ceed_data, true, rhs_ceed, &target)); + ceed_data, true, true, rhs_ceed, &target)); // Gather RHS PetscCall(VecC2P(rhs_ceed, mem_type, rhs_loc)); @@ -183,9 +193,10 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) { PC pc; PetscCall(KSPGetPC(ksp, &pc)); - if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2) { + if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2 || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24 || + rp->bp_choice == CEED_BP15 || rp->bp_choice == CEED_BP26) { PetscCall(PCSetType(pc, PCJACOBI)); - if (rp->simplex) { + if (rp->simplex || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24 || rp->bp_choice == CEED_BP15 || rp->bp_choice == CEED_BP26) { PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL)); } else { PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM)); @@ -255,7 +266,11 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) PetscCall(SetupErrorOperatorCtx(rp->comm, dm, ceed, ceed_data, X_loc, op_error, op_error_ctx)); PetscScalar l2_error; PetscCall(ComputeL2Error(X, &l2_error, op_error_ctx)); - PetscReal tol = 5e-2; + // Tighter tol for BP1, BP2 + // Looser tol for BP3, BP4, BP5, and BP6 with extra for vector valued problems + // BP1+3 and BP2+4 follow the pattern for BP3 and BP4 + // BP1+5 and BP2+6 follow the pattern for BP5 and BP6 + PetscReal tol = rp->bp_choice < CEED_BP3 ? 5e-4 : (5e-2 + (rp->bp_choice % 2 == 1 ? 
5e-3 : 0)); if (!rp->test_mode || l2_error > tol) { PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm)); PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, rp->comm)); diff --git a/examples/petsc/bps.h b/examples/petsc/bps.h index 9100c8af47..95d4a4c644 100644 --- a/examples/petsc/bps.h +++ b/examples/petsc/bps.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -17,4 +17,4 @@ static const char *const mem_types[] = {"host", "device", "memType", "CEED_MEM_" typedef enum { COARSEN_UNIFORM = 0, COARSEN_LOGARITHMIC = 1 } CoarsenType; static const char *const coarsen_types[] = {"uniform", "logarithmic", "CoarsenType", "COARSEN", 0}; -static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "BPType", "CEED_BP", 0}; +static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "bp1_3", "bp2_4", "bp1_5", "bp2_6", "BPType", "CEED_BP", 0}; diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c index 5e2cdac76f..5bb10f4bd7 100644 --- a/examples/petsc/bpsraw.c +++ b/examples/petsc/bpsraw.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -385,8 +385,8 @@ int main(int argc, char **argv) { PetscInt two = 2; ksp_max_it_clip[0] = 5; ksp_max_it_clip[1] = 20; - PetscCall( - PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two, NULL)); + PetscCall(PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two, + NULL)); PetscOptionsEnd(); P = degree + 1; Q = P + q_extra; @@ -403,9 +403,9 @@ int main(int argc, char **argv) { break; case CEED_MEM_DEVICE: { const char *resolved; + CeedGetResource(ceed, &resolved); if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) default_vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 else if (strstr(resolved, "/gpu/hip")) default_vec_type = VECHIP; else default_vec_type = VECSTANDARD; } @@ -438,9 +438,6 @@ int main(int argc, char **argv) { PetscCall(VecSetFromOptions(X)); PetscCall(VecSetUp(X)); - // Set up libCEED - CeedInit(ceed_resource, &ceed); - // Print summary PetscInt gsize; @@ -509,8 +506,9 @@ int main(int argc, char **argv) { l_to_g_ind[here] = g_start[ir][jr][kr] + (ii * g_m_nodes[ir][jr][kr][1] + jj) * g_m_nodes[ir][jr][kr][2] + kk; if ((i_rank[0] == 0 && i == 0) || (i_rank[1] == 0 && j == 0) || (i_rank[2] == 0 && k == 0) || (i_rank[0] + 1 == p[0] && i + 1 == l_nodes[0]) || (i_rank[1] + 1 == p[1] && j + 1 == l_nodes[1]) || - (i_rank[2] + 1 == p[2] && k + 1 == l_nodes[2])) + (i_rank[2] + 1 == p[2] && k + 1 == l_nodes[2])) { continue; + } l_to_g_ind_0[l_0_count] = l_to_g_ind[here]; loc_ind[l_0_count++] = here; } @@ -768,8 +766,8 @@ int main(int argc, char **argv) { } } if (!test_mode) { - PetscCall( - PetscPrintf(comm, " DoFs/Sec in CG : %g (%g) million\n", 1e-6 * gsize * its / rt_max, 1e-6 * gsize * its / rt_min)); + PetscCall(PetscPrintf(comm, " DoFs/Sec in CG : %g (%g) million\n", 1e-6 * 
gsize * its / rt_max, + 1e-6 * gsize * its / rt_min)); } } @@ -798,21 +796,22 @@ int main(int argc, char **argv) { CeedVectorDestroy(&op_apply_ctx->y_ceed); CeedVectorDestroy(&op_apply_ctx->q_data); CeedVectorDestroy(&target); - CeedOperatorDestroy(&op_setup_geo); - CeedOperatorDestroy(&op_setup_rhs); - CeedOperatorDestroy(&op_apply); - CeedOperatorDestroy(&op_error); CeedElemRestrictionDestroy(&elem_restr_u); CeedElemRestrictionDestroy(&elem_restr_x); CeedElemRestrictionDestroy(&elem_restr_u_i); CeedElemRestrictionDestroy(&elem_restr_qd_i); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); CeedQFunctionDestroy(&qf_setup_geo); CeedQFunctionDestroy(&qf_setup_rhs); CeedQFunctionDestroy(&qf_apply); CeedQFunctionDestroy(&qf_error); - CeedBasisDestroy(&basis_u); - CeedBasisDestroy(&basis_x); + CeedOperatorDestroy(&op_setup_geo); + CeedOperatorDestroy(&op_setup_rhs); + CeedOperatorDestroy(&op_apply); + CeedOperatorDestroy(&op_error); CeedDestroy(&ceed); + PetscCall(PetscFree(op_apply_ctx)); return PetscFinalize(); } diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c index d928a815c1..30489224b1 100644 --- a/examples/petsc/bpssphere.c +++ b/examples/petsc/bpssphere.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -64,7 +64,7 @@ int main(int argc, char **argv) { CeedOperator op_error; CeedVector rhs_ceed, target; BPType bp_choice; - VecType vec_type; + VecType vec_type = VECSTANDARD; PetscMemType mem_type; PetscCall(PetscInitialize(&argc, &argv, NULL, help)); @@ -92,6 +92,26 @@ int main(int argc, char **argv) { PetscCall(PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", NULL, simplex, &simplex, NULL)); PetscOptionsEnd(); + // Set up libCEED + CeedInit(ceed_resource, &ceed); + CeedMemType mem_type_backend; + CeedGetPreferredMemType(ceed, &mem_type_backend); + + // Set mesh vec_type + switch (mem_type_backend) { + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; + } + } + // Setup DM if (read_mesh) { PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm)); @@ -104,6 +124,7 @@ int main(int argc, char **argv) { // Refine DMPlex with uniform refinement using runtime option -dm_refine PetscCall(DMPlexSetRefinementUniform(dm, PETSC_TRUE)); } + PetscCall(DMSetVecType(dm, vec_type)); PetscCall(DMSetFromOptions(dm)); // View DMPlex via runtime option PetscCall(DMViewFromOptions(dm, NULL, "-dm_view")); @@ -125,29 +146,6 @@ int main(int argc, char **argv) { PetscCall(MatCreateShell(comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O)); PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed)); - // Set up libCEED - CeedInit(ceed_resource, &ceed); - CeedMemType mem_type_backend; - CeedGetPreferredMemType(ceed, &mem_type_backend); - - PetscCall(DMGetVecType(dm, &vec_type)); - if (!vec_type) { // Not yet set by user -dm_vec_type - switch (mem_type_backend) { - case CEED_MEM_HOST: - vec_type = VECSTANDARD; - break; - case 
CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } - } - PetscCall(DMSetVecType(dm, vec_type)); - } - // Print summary if (!test_mode) { PetscInt P = degree + 1, Q = P + q_extra; @@ -175,7 +173,7 @@ int main(int argc, char **argv) { // Setup libCEED's objects PetscCall(PetscMalloc1(1, &ceed_data)); PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, bp_options[bp_choice], ceed_data, true, - rhs_ceed, &target)); + true, rhs_ceed, &target)); // Gather RHS PetscCall(VecC2P(rhs_ceed, mem_type, rhs_loc)); diff --git a/examples/petsc/bpssphere.h b/examples/petsc/bpssphere.h index c3c7678f54..c5d030bab8 100644 --- a/examples/petsc/bpssphere.h +++ b/examples/petsc/bpssphere.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c index 5796cf7471..e4ba5aed4c 100644 --- a/examples/petsc/bpsswarm.c +++ b/examples/petsc/bpsswarm.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -65,7 +65,7 @@ int main(int argc, char **argv) { CeedData ceed_data; CeedOperator op_error; BPType bp_choice; - VecType vec_type; + VecType vec_type = VECSTANDARD; PointSwarmType point_swarm_type = SWARM_GAUSS; PetscMPIInt ranks_per_node; char hostname[PETSC_MAX_PATH_LEN]; @@ -146,6 +146,26 @@ int main(int argc, char **argv) { } PetscOptionsEnd(); + // Set up libCEED + CeedInit(ceed_resource, &ceed); + CeedMemType mem_type_backend; + CeedGetPreferredMemType(ceed, &mem_type_backend); + + // Set background mesh vec_type + switch (mem_type_backend) { + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; + } + } + // Setup DM if (read_mesh) { PetscCall(DMPlexCreateFromFile(comm, filename, NULL, PETSC_TRUE, &dm_mesh)); @@ -162,11 +182,13 @@ int main(int argc, char **argv) { PetscCheck(!is_simplex, comm, PETSC_ERR_USER, "Only tensor-product background meshes supported"); } } + PetscCall(DMSetVecType(dm_mesh, vec_type)); + PetscCall(DMSetFromOptions(dm_mesh)); + PetscCall(DMGetDimension(dm_mesh, &dim)); PetscCall(SetupDMByDegree(dm_mesh, degree, q_extra, num_comp_u, dim, bp_options[bp_choice].enforce_bc)); // View mesh - PetscCall(DMSetOptionsPrefix(dm_mesh, "final_")); PetscCall(DMViewFromOptions(dm_mesh, NULL, "-dm_view")); // Create particle swarm @@ -209,29 +231,7 @@ int main(int argc, char **argv) { PetscCall(MatCreateShell(comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O)); PetscCall(MatSetDM(mat_O, dm_mesh)); PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed)); - - // Set up libCEED - CeedInit(ceed_resource, &ceed); - CeedMemType mem_type_backend; - CeedGetPreferredMemType(ceed, &mem_type_backend); - - PetscCall(DMGetVecType(dm_mesh, &vec_type)); 
- if (!vec_type) { // Not yet set by user -dm_vec_type - switch (mem_type_backend) { - case CEED_MEM_HOST: - vec_type = VECSTANDARD; - break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } - } - PetscCall(DMSetVecType(dm_mesh, vec_type)); - } + PetscCall(MatShellSetOperation(mat_O, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag)); // Print summary if (!test_mode) { @@ -302,14 +302,9 @@ int main(int argc, char **argv) { PetscCall(KSPGetPC(ksp, &pc)); if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) { PetscCall(PCSetType(pc, PCJACOBI)); - PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM)); + PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL)); } else { PetscCall(PCSetType(pc, PCNONE)); - MatNullSpace nullspace; - - PetscCall(MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, 0, &nullspace)); - PetscCall(MatSetNullSpace(mat_O, nullspace)); - PetscCall(MatNullSpaceDestroy(&nullspace)); } PetscCall(KSPSetType(ksp, KSPCG)); PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL)); diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c index 557ace7ec2..fa95f16979 100644 --- a/examples/petsc/dmswarm.c +++ b/examples/petsc/dmswarm.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -18,7 +18,7 @@ // // ./dmswarm -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -num_comp 2 -swarm gauss // -//TESTARGS(name="Uniform swarm, CG projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm uniform -solution_order 3 -points_per_cell 125 +//TESTARGS(name="Uniform swarm, CG projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm uniform -solution_order 3 -q_extra 0 -points_per_cell 125 //TESTARGS(name="Gauss swarm, lumped projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm gauss -ksp_type preonly -pc_type jacobi -pc_jacobi_type rowsum -tolerance 9e-2 /// @file @@ -82,10 +82,10 @@ int main(int argc, char **argv) { PetscOptionsBegin(comm, NULL, "libCEED example using PETSc with DMSwarm", NULL); PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL)); - PetscCall( - PetscOptionsBool("-u_petsc_swarm_view", "View XDMF of swarm values interpolated by PETSc", NULL, view_petsc_swarm, &view_petsc_swarm, NULL)); - PetscCall( - PetscOptionsBool("-u_ceed_swarm_view", "View XDMF of swarm values interpolated by libCEED", NULL, view_ceed_swarm, &view_ceed_swarm, NULL)); + PetscCall(PetscOptionsBool("-u_petsc_swarm_view", "View XDMF of swarm values interpolated by PETSc", NULL, view_petsc_swarm, &view_petsc_swarm, + NULL)); + PetscCall(PetscOptionsBool("-u_ceed_swarm_view", "View XDMF of swarm values interpolated by libCEED", NULL, view_ceed_swarm, &view_ceed_swarm, + NULL)); PetscCall(PetscOptionsEnum("-target", "Target field function", NULL, 
target_types, (PetscEnum)target_type, (PetscEnum *)&target_type, NULL)); PetscCall(PetscOptionsInt("-solution_order", "Order of mesh solution space", NULL, solution_order, &solution_order, NULL)); PetscCall(PetscOptionsInt("-mesh_order", "Order of mesh coordinate space", NULL, mesh_order, &mesh_order, NULL)); @@ -398,7 +398,7 @@ PetscErrorCode DMSwarmInterpolateFromCellToSwarm_Petsc(DM dm_swarm, const char * PetscCall(DMRestoreWorkArray(dm_mesh, num_points_in_cell * dim, MPIU_REAL, &coords_points_cell_true)); PetscCall(DMRestoreWorkArray(dm_mesh, num_points_in_cell * dim, MPIU_REAL, &coords_points_cell_ref)); PetscCall(PetscTabulationDestroy(&tabulation)); - PetscCall(PetscFree(points_cell)); + PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points_cell)); } // Cleanup @@ -486,7 +486,7 @@ PetscErrorCode DMSwarmCheckSwarmValues(DM dm_swarm, const char *field, PetscScal } // -- Cleanup - PetscCall(PetscFree(points)); + PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points)); } // Cleanup diff --git a/examples/petsc/include/areaproblemdata.h b/examples/petsc/include/areaproblemdata.h index cb5a254085..5820409159 100644 --- a/examples/petsc/include/areaproblemdata.h +++ b/examples/petsc/include/areaproblemdata.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h index f89aadc318..9525216d0f 100644 --- a/examples/petsc/include/bpsproblemdata.h +++ b/examples/petsc/include/bpsproblemdata.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -14,7 +14,9 @@ #include "../include/structs.h" #include "../qfunctions/bps/bp1.h" +#include "../qfunctions/bps/bp13.h" #include "../qfunctions/bps/bp2.h" +#include "../qfunctions/bps/bp24.h" #include "../qfunctions/bps/bp3.h" #include "../qfunctions/bps/bp4.h" #include "../qfunctions/bps/common.h" @@ -23,107 +25,175 @@ // BP Option Data // ----------------------------------------------------------------------------- -BPData bp_options[6] = { - [CEED_BP1] = {.num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeo, - .setup_rhs = SetupMassRhs, - .apply = Mass, - .error = Error, - .setup_geo_loc = SetupMassGeo_loc, - .setup_rhs_loc = SetupMassRhs_loc, - .apply_loc = Mass_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_FALSE}, - [CEED_BP2] = {.num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeo, - .setup_rhs = SetupMassRhs3, - .apply = Mass3, - .error = Error3, - .setup_geo_loc = SetupMassGeo_loc, - .setup_rhs_loc = SetupMassRhs3_loc, - .apply_loc = Mass3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_FALSE}, - [CEED_BP3] = {.num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 1, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs, - .apply = Diff, - .error = Error, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs_loc, - .apply_loc = Diff_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_TRUE }, - 
[CEED_BP4] = {.num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 1, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs3, - .apply = Diff3, - .error = Error3, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs3_loc, - .apply_loc = Diff3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_TRUE }, - [CEED_BP5] = {.num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 0, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs, - .apply = Diff, - .error = Error, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs_loc, - .apply_loc = Diff_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS_LOBATTO, - .enforce_bc = PETSC_TRUE }, - [CEED_BP6] = {.num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 0, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs3, - .apply = Diff3, - .error = Error3, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs3_loc, - .apply_loc = Diff3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS_LOBATTO, - .enforce_bc = PETSC_TRUE } +BPData bp_options[10] = { + [CEED_BP1] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeo, + .setup_rhs = SetupMassRhs, + .apply = Mass, + .error = Error, + .setup_geo_loc = SetupMassGeo_loc, + .setup_rhs_loc = SetupMassRhs_loc, + .apply_loc = Mass_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_FALSE}, + [CEED_BP2] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeo, + .setup_rhs = SetupMassRhs3, + .apply = Mass3, + .error = 
Error3, + .setup_geo_loc = SetupMassGeo_loc, + .setup_rhs_loc = SetupMassRhs3_loc, + .apply_loc = Mass3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_FALSE}, + [CEED_BP3] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs, + .apply = Diff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs_loc, + .apply_loc = Diff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_TRUE }, + [CEED_BP4] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs3, + .apply = Diff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs3_loc, + .apply_loc = Diff3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_TRUE }, + [CEED_BP5] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs, + .apply = Diff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs_loc, + .apply_loc = Diff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO, + .enforce_bc = PETSC_TRUE }, + [CEED_BP6] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs3, + .apply = Diff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs3_loc, + .apply_loc = Diff3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO, + 
.enforce_bc = PETSC_TRUE }, + [CEED_BP13] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupMassDiffRhs, + .apply = MassDiff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupMassDiffRhs_loc, + .apply_loc = MassDiff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_TRUE }, + [CEED_BP24] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupMassDiffRhs3, + .apply = MassDiff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupMassDiffRhs3_loc, + .apply_loc = MassDiff3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_TRUE }, + [CEED_BP15] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupMassDiffRhs, + .apply = MassDiff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupMassDiffRhs_loc, + .apply_loc = MassDiff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO, + .enforce_bc = PETSC_TRUE }, + [CEED_BP26] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupMassDiffRhs3, + .apply = MassDiff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupMassDiffRhs3_loc, + .apply_loc = MassDiff3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_INTERP + CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO, + 
.enforce_bc = PETSC_TRUE }, }; diff --git a/examples/petsc/include/libceedsetup.h b/examples/petsc/include/libceedsetup.h index 611c30eb9a..c87130e923 100644 --- a/examples/petsc/include/libceedsetup.h +++ b/examples/petsc/include/libceedsetup.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -16,8 +16,8 @@ PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data); PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u, - PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed, - CeedVector *target); + PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, PetscBool is_fine_level, + CeedVector rhs_ceed, CeedVector *target); PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, CeedInt num_comp_u, CeedData *data, BPData bp_data, Vec fine_mult); PetscErrorCode SetupErrorOperator(DM dm, Ceed ceed, BPData bp_data, CeedInt topo_dim, PetscInt num_comp_x, PetscInt num_comp_u, CeedOperator *op_error); diff --git a/examples/petsc/include/matops.h b/examples/petsc/include/matops.h index 8c29f9e76a..d9e03b6f6d 100644 --- a/examples/petsc/include/matops.h +++ b/examples/petsc/include/matops.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/include/petscutils.h b/examples/petsc/include/petscutils.h index b1b07f7672..0f1f5d0ad6 100644 --- a/examples/petsc/include/petscutils.h +++ b/examples/petsc/include/petscutils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h index 8c1d3f92be..426aeae2ba 100644 --- a/examples/petsc/include/petscversion.h +++ b/examples/petsc/include/petscversion.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -9,6 +9,6 @@ /// Petsc version check #pragma once -#if PETSC_VERSION_LT(3, 21, 0) -#error "PETSc v3.21 or later is required" +#if PETSC_VERSION_LT(3, 23, 0) +#error "PETSc v3.23 or later is required" #endif diff --git a/examples/petsc/include/sphereproblemdata.h b/examples/petsc/include/sphereproblemdata.h index 5142d9eeba..4a63deea05 100644 --- a/examples/petsc/include/sphereproblemdata.h +++ b/examples/petsc/include/sphereproblemdata.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h index c33ad80b9b..8b2647fe16 100644 --- a/examples/petsc/include/structs.h +++ b/examples/petsc/include/structs.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -65,7 +65,18 @@ typedef struct { } BPData; // BP options -typedef enum { CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5 } BPType; +typedef enum { + CEED_BP1 = 0, + CEED_BP2 = 1, + CEED_BP3 = 2, + CEED_BP4 = 3, + CEED_BP5 = 4, + CEED_BP6 = 5, + CEED_BP13 = 6, + CEED_BP24 = 7, + CEED_BP15 = 8, + CEED_BP26 = 9, +} BPType; // ----------------------------------------------------------------------------- // Parameter structure for running problems diff --git a/examples/petsc/include/swarmutils.h b/examples/petsc/include/swarmutils.h index 0eeff6e301..4beed9bef1 100644 --- a/examples/petsc/include/swarmutils.h +++ b/examples/petsc/include/swarmutils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c index e9f78197a3..1bce6a318a 100644 --- a/examples/petsc/multigrid.c +++ b/examples/petsc/multigrid.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -117,19 +117,21 @@ int main(int argc, char **argv) { if (read_mesh) { PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm_orig)); } else { - PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, &dm_orig)); + PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, &dm_orig)); } - VecType vec_type; + // Set mesh vec_type + VecType vec_type = VECSTANDARD; + switch (mem_type_backend) { case CEED_MEM_HOST: vec_type = VECSTANDARD; break; case CEED_MEM_DEVICE: { const char *resolved; + CeedGetResource(ceed, &resolved); if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; else vec_type = VECSTANDARD; } @@ -155,10 +157,10 @@ int main(int argc, char **argv) { switch (coarsen) { case COARSEN_UNIFORM: - for (int i = 0; i < num_levels; i++) level_degrees[i] = i + 1; + for (PetscInt i = 0; i < num_levels; i++) level_degrees[i] = i + 1; break; case COARSEN_LOGARITHMIC: - for (int i = 0; i < num_levels - 1; i++) level_degrees[i] = pow(2, i); + for (PetscInt i = 0; i < num_levels - 1; i++) level_degrees[i] = pow(2, i); level_degrees[fine_level] = degree; break; } @@ -181,7 +183,7 @@ int main(int argc, char **argv) { CeedElemTopology elem_topo = ElemTopologyP2C(cell_type); // Setup DM and Operator Mat Shells for each level - for (CeedInt i = 0; i < num_levels; i++) { + for (PetscInt i = 0; i < num_levels; i++) { // Create DM PetscCall(DMClone(dm_orig, &dm[i])); PetscCall(DMGetVecType(dm_orig, &vec_type)); @@ -199,7 +201,6 @@ int main(int argc, char **argv) { // Operator PetscCall(PetscMalloc1(1, &op_apply_ctx[i])); - PetscCall(PetscMalloc1(1, 
&op_error_ctx)); PetscCall(MatCreateShell(comm, l_size[i], l_size[i], g_size[i], g_size[i], op_apply_ctx[i], &mat_O[i])); PetscCall(MatShellSetOperation(mat_O[i], MATOP_MULT, (void (*)(void))MatMult_Ceed)); PetscCall(MatShellSetOperation(mat_O[i], MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag)); @@ -267,7 +268,7 @@ int main(int argc, char **argv) { } PetscCall(PetscMalloc1(1, &ceed_data[i])); PetscCall(SetupLibceedByDegree(dm[i], ceed, level_degrees[i], dim, q_extra, dim, num_comp_u, g_size[i], xl_size[i], bp_options[bp_choice], - ceed_data[i], i == (fine_level), rhs_ceed, &target)); + ceed_data[i], i == fine_level, i == fine_level, rhs_ceed, &target)); } // Gather RHS @@ -291,7 +292,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_error, "error", ceed_data[fine_level]->elem_restr_u, ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); // Calculate multiplicity - for (int i = 0; i < num_levels; i++) { + for (PetscInt i = 0; i < num_levels; i++) { PetscMemType mem_type; // CEED vector @@ -322,7 +323,7 @@ int main(int argc, char **argv) { } // Set up Mat - for (int i = 0; i < num_levels; i++) { + for (PetscInt i = fine_level; i >= 0; i--) { // Set up apply operator context PetscCall(SetupApplyOperatorCtx(comm, dm[i], ceed, ceed_data[i], X_loc[i], op_apply_ctx[i])); @@ -335,8 +336,8 @@ int main(int argc, char **argv) { pr_restr_ctx[i]->loc_vec_c = X_loc[i - 1]; pr_restr_ctx[i]->loc_vec_f = op_apply_ctx[i]->Y_loc; pr_restr_ctx[i]->mult_vec = mult[i]; - pr_restr_ctx[i]->ceed_vec_c = op_apply_ctx[i - 1]->x_ceed; - pr_restr_ctx[i]->ceed_vec_f = op_apply_ctx[i]->y_ceed; + pr_restr_ctx[i]->ceed_vec_c = ceed_data[i - 1]->x_ceed; + pr_restr_ctx[i]->ceed_vec_f = ceed_data[i]->y_ceed; pr_restr_ctx[i]->op_prolong = ceed_data[i]->op_prolong; pr_restr_ctx[i]->op_restrict = ceed_data[i]->op_restrict; pr_restr_ctx[i]->ceed = ceed; @@ -393,7 +394,7 @@ int main(int argc, char **argv) { // PCMG levels PetscCall(PCMGSetLevels(pc, num_levels, NULL)); - for (int i = 0; i < 
num_levels; i++) { + for (PetscInt i = 0; i < num_levels; i++) { // Smoother KSP smoother; PC smoother_pc; @@ -502,6 +503,7 @@ int main(int argc, char **argv) { } { // Set up error operator context + PetscCall(PetscMalloc1(1, &op_error_ctx)); PetscCall(SetupErrorOperatorCtx(comm, dm[fine_level], ceed, ceed_data[fine_level], X_loc[fine_level], op_error, op_error_ctx)); PetscScalar l2_error; PetscCall(ComputeL2Error(X[fine_level], &l2_error, op_error_ctx)); @@ -532,7 +534,7 @@ int main(int argc, char **argv) { } // Cleanup - for (int i = 0; i < num_levels; i++) { + for (PetscInt i = 0; i < num_levels; i++) { PetscCall(VecDestroy(&X[i])); PetscCall(VecDestroy(&X_loc[i])); PetscCall(VecDestroy(&mult[i])); diff --git a/examples/petsc/qfunctions/area/areacube.h b/examples/petsc/qfunctions/area/areacube.h index 93be0594b6..f008846f2a 100644 --- a/examples/petsc/qfunctions/area/areacube.h +++ b/examples/petsc/qfunctions/area/areacube.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factor required for integration when reference coordinates have a different dimension than the one of physical diff --git a/examples/petsc/qfunctions/area/areasphere.h b/examples/petsc/qfunctions/area/areasphere.h index 7cd73ca354..13e5536e14 100644 --- a/examples/petsc/qfunctions/area/areasphere.h +++ b/examples/petsc/qfunctions/area/areasphere.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factor required for integration when reference coordinates have a different dimension than the one of physical diff --git a/examples/petsc/qfunctions/bps/bp1.h b/examples/petsc/qfunctions/bps/bp1.h index a902b29f7c..fb35d0249e 100644 --- a/examples/petsc/qfunctions/bps/bp1.h +++ b/examples/petsc/qfunctions/bps/bp1.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factors required to apply the mass operator diff --git a/examples/petsc/qfunctions/bps/bp13.h b/examples/petsc/qfunctions/bps/bp13.h new file mode 100644 index 0000000000..33d454546d --- /dev/null +++ b/examples/petsc/qfunctions/bps/bp13.h @@ -0,0 +1,74 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// libCEED QFunctions for diffusion operator example using PETSc + +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif + +// ----------------------------------------------------------------------------- +// This QFunction sets up the rhs and true solution for the problem +// ----------------------------------------------------------------------------- +CEED_QFUNCTION(SetupMassDiffRhs)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + const CeedScalar *x = in[0], *w = in[1]; + CeedScalar *true_soln = out[0], *rhs = out[1]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + const CeedScalar c[3] = {0, 1., 2.}; + const CeedScalar k[3] = {1., 2., 3.}; + + true_soln[i] = sin(M_PI * (c[0] + k[0] * x[i + Q * 0])) * sin(M_PI * (c[1] + k[1] * x[i + Q * 1])) * sin(M_PI * (c[2] + k[2] * x[i + Q * 2])); + + rhs[i] = w[i + Q * 0] * (M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) + 1.0) * true_soln[i]; + } // End of Quadrature Point Loop + return 0; +} + +// 
----------------------------------------------------------------------------- +// This QFunction applies the mass + diffusion operator for a scalar field. +// +// Inputs: +// u - Input vector at quadrature points +// ug - Input vector gradient at quadrature points +// q_data - Geometric factors +// +// Output: +// v - Output vector (test functions) at quadrature points +// vg - Output vector (test functions) gradient at quadrature points +// ----------------------------------------------------------------------------- +CEED_QFUNCTION(MassDiff)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2]; + CeedScalar *v = out[0], *vg = out[1]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Read spatial derivatives of u + const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]}; + // Read q_data (dXdxdXdx_T symmetric matrix) + const CeedScalar dXdxdXdx_T[3][3] = { + {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]}, + {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]}, + {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]} + }; + + // Mass + v[i] = q_data[i + 0 * Q] * u[i]; + // Diff + for (int j = 0; j < 3; j++) { // j = direction of vg + vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]); + } + } // End of Quadrature Point Loop + return 0; +} +// ----------------------------------------------------------------------------- diff --git a/examples/petsc/qfunctions/bps/bp1sphere.h b/examples/petsc/qfunctions/bps/bp1sphere.h index d604406f29..394d3d6cae 100644 --- a/examples/petsc/qfunctions/bps/bp1sphere.h +++ b/examples/petsc/qfunctions/bps/bp1sphere.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factors required for integration and coordinate transformations when reference coordinates have a different diff --git a/examples/petsc/qfunctions/bps/bp2.h b/examples/petsc/qfunctions/bps/bp2.h index 22ba9fb788..21da3ec39a 100644 --- a/examples/petsc/qfunctions/bps/bp2.h +++ b/examples/petsc/qfunctions/bps/bp2.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the rhs and true solution for the problem diff --git a/examples/petsc/qfunctions/bps/bp24.h b/examples/petsc/qfunctions/bps/bp24.h new file mode 100644 index 0000000000..4870cd1cfe --- /dev/null +++ b/examples/petsc/qfunctions/bps/bp24.h @@ -0,0 +1,92 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// libCEED QFunctions for mass + diffusion operator example using PETSc + +#include +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif + +// ----------------------------------------------------------------------------- +// This QFunction sets up the rhs and true solution for the problem +// ----------------------------------------------------------------------------- +CEED_QFUNCTION(SetupMassDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + const CeedScalar *x = in[0], *w = in[1]; + CeedScalar *true_soln = out[0], *rhs = out[1]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + const CeedScalar c[3] = {0, 1., 2.}; + const CeedScalar k[3] = {1., 2., 3.}; + + // Component 1 + true_soln[i + 0 * Q] = + sin(M_PI * (c[0] + k[0] * x[i + Q * 0])) * sin(M_PI * (c[1] + k[1] * x[i + Q * 1])) * sin(M_PI * (c[2] + k[2] * x[i + Q * 2])); + // Component 2 + true_soln[i + 1 * Q] = 2 * true_soln[i + 0 * Q]; + // Component 3 + true_soln[i + 2 * Q] = 3 * true_soln[i + 0 * Q]; + + // Component 1 + rhs[i + 0 * Q] = w[i + Q * 0] * (M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) + 1.0) * true_soln[i + 0 * Q]; + // Component 2 + rhs[i + 1 * Q] = 2 * rhs[i + 0 * Q]; + // Component 3 + rhs[i + 2 * Q] = 3 * rhs[i + 0 * Q]; + } // End of Quadrature Point Loop + return 0; +} + +// ----------------------------------------------------------------------------- +// This QFunction applies the mass + diffusion operator for a vector field of 3 components. 
+// +// Inputs: +// u - Input vector at quadrature points +// ug - Input vector Jacobian at quadrature points +// q_data - Geometric factors +// +// Output: +// v - Output vector (test functions) at quadrature points +// vJ - Output vector (test functions) Jacobian at quadrature points +// ----------------------------------------------------------------------------- +CEED_QFUNCTION(MassDiff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2]; + CeedScalar *v = out[0], *vg = out[1]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Read spatial derivatives of u components + const CeedScalar uJ[3][3] = { + {ug[i + (0 + 0 * 3) * Q], ug[i + (0 + 1 * 3) * Q], ug[i + (0 + 2 * 3) * Q]}, + {ug[i + (1 + 0 * 3) * Q], ug[i + (1 + 1 * 3) * Q], ug[i + (1 + 2 * 3) * Q]}, + {ug[i + (2 + 0 * 3) * Q], ug[i + (2 + 1 * 3) * Q], ug[i + (2 + 2 * 3) * Q]} + }; + // Read q_data (dXdxdXdx_T symmetric matrix) + const CeedScalar dXdxdXdx_T[3][3] = { + {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]}, + {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]}, + {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]} + }; + + for (int k = 0; k < 3; k++) { // k = component + // Mass + v[i + k * Q] = q_data[i + 0 * Q] * u[i + k * Q]; + // Diff + for (int j = 0; j < 3; j++) { // j = direction of vg + vg[i + (k + j * 3) * Q] = (uJ[k][0] * dXdxdXdx_T[0][j] + uJ[k][1] * dXdxdXdx_T[1][j] + uJ[k][2] * dXdxdXdx_T[2][j]); + } + } + } // End of Quadrature Point Loop + + return 0; +} +// ----------------------------------------------------------------------------- diff --git a/examples/petsc/qfunctions/bps/bp2sphere.h b/examples/petsc/qfunctions/bps/bp2sphere.h index 36a8e95778..aa08525c86 100644 --- a/examples/petsc/qfunctions/bps/bp2sphere.h +++ b/examples/petsc/qfunctions/bps/bp2sphere.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, 
LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example for a vector field on the sphere using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the rhs and true solution for the problem diff --git a/examples/petsc/qfunctions/bps/bp3.h b/examples/petsc/qfunctions/bps/bp3.h index dcf84defae..153ad6e021 100644 --- a/examples/petsc/qfunctions/bps/bp3.h +++ b/examples/petsc/qfunctions/bps/bp3.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for diffusion operator example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factors required to apply the diffusion operator @@ -87,7 +89,6 @@ CEED_QFUNCTION(SetupDiffRhs)(void *ctx, CeedInt Q, const CeedScalar *const *in, rhs[i] = w[i + Q * 0] * M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) * true_soln[i]; } // End of Quadrature Point Loop - return 0; } diff --git a/examples/petsc/qfunctions/bps/bp3sphere.h b/examples/petsc/qfunctions/bps/bp3sphere.h index 1f901dd97a..911e14d0ac 100644 --- a/examples/petsc/qfunctions/bps/bp3sphere.h +++ b/examples/petsc/qfunctions/bps/bp3sphere.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for diffusion operator example for a scalar field on the sphere using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factors required for integration and coordinate transformations when reference coordinates have a different diff --git a/examples/petsc/qfunctions/bps/bp4.h b/examples/petsc/qfunctions/bps/bp4.h index 46307c338a..0ccad57d68 100644 --- a/examples/petsc/qfunctions/bps/bp4.h +++ b/examples/petsc/qfunctions/bps/bp4.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for diffusion operator example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the rhs and true solution for the problem @@ -41,7 +43,6 @@ CEED_QFUNCTION(SetupDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in, // Component 3 rhs[i + 2 * Q] = 3 * rhs[i + 0 * Q]; } // End of Quadrature Point Loop - return 0; } @@ -56,7 +57,7 @@ CEED_QFUNCTION(SetupDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in, // vJ - Output vector (test functions) Jacobian at quadrature points // ----------------------------------------------------------------------------- CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar *ug = in[0], *qd = in[1]; + const CeedScalar *ug = in[0], *q_data = in[1]; CeedScalar *vg = out[0]; // Quadrature Point Loop @@ -69,9 +70,9 @@ CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca }; // Read q_data (dXdxdXdx_T symmetric matrix) const CeedScalar dXdxdXdx_T[3][3] = { - {qd[i + 1 * Q], qd[i + 2 * Q], qd[i + 3 * Q]}, - {qd[i + 2 * Q], qd[i + 4 * Q], qd[i + 5 * Q]}, - {qd[i + 3 * Q], qd[i + 5 * Q], qd[i + 6 * Q]} + {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]}, + {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]}, + {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]} }; for (int k = 0; k < 3; k++) { // k = component @@ -80,7 +81,6 @@ CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca } } } // End of Quadrature Point Loop - return 0; } // 
----------------------------------------------------------------------------- diff --git a/examples/petsc/qfunctions/bps/bp4sphere.h b/examples/petsc/qfunctions/bps/bp4sphere.h index 517f353371..43b4806afe 100644 --- a/examples/petsc/qfunctions/bps/bp4sphere.h +++ b/examples/petsc/qfunctions/bps/bp4sphere.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// libCEED QFunctions for mass operator example for a vector field on the sphere using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // This QFunction sets up the rhs and true solution for the problem diff --git a/examples/petsc/qfunctions/bps/common.h b/examples/petsc/qfunctions/bps/common.h index 26f374d5d4..09cccd5840 100644 --- a/examples/petsc/qfunctions/bps/common.h +++ b/examples/petsc/qfunctions/bps/common.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,7 @@ /// @file /// libCEED QFunctions for BP examples using PETSc -#include +#include // ----------------------------------------------------------------------------- CEED_QFUNCTION(Error)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { diff --git a/examples/petsc/qfunctions/swarm/swarmmass.h b/examples/petsc/qfunctions/swarm/swarmmass.h index e355eff8d7..1b6fa1e21c 100644 --- a/examples/petsc/qfunctions/swarm/swarmmass.h +++ b/examples/petsc/qfunctions/swarm/swarmmass.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(SetupMass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; diff --git a/examples/petsc/src/libceedsetup.c b/examples/petsc/src/libceedsetup.c index 086fb669c7..8f8323e7a6 100644 --- a/examples/petsc/src/libceedsetup.c +++ b/examples/petsc/src/libceedsetup.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -40,14 +40,14 @@ PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data) { // Set up libCEED for a given degree // ----------------------------------------------------------------------------- PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u, - PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed, - CeedVector *target) { + PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, PetscBool is_fine_level, + CeedVector rhs_ceed, CeedVector *target) { DM dm_coord; Vec coords; const PetscScalar *coord_array; CeedBasis basis_x, basis_u; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i; - CeedQFunction qf_setup_geo, qf_apply; + CeedQFunction qf_setup_geo = NULL, qf_apply = NULL; CeedOperator op_setup_geo, op_apply; CeedVector x_coord, q_data, x_ceed, y_ceed; PetscInt c_start, c_end, num_elem; @@ -86,36 +86,64 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to CeedVectorCreate(ceed, xl_size, &x_ceed); CeedVectorCreate(ceed, xl_size, &y_ceed); - // Create the QFunction that builds the context data - CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo); - CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE); - - // Create the operator that builds the quadrature data - CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo); - CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, 
CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - - // Setup q_data - CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE); - - // Set up PDE operator - CeedInt in_scale = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1; - CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1; - CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply); - CeedQFunctionAddInput(qf_apply, "u", num_comp_u * in_scale, bp_data.in_mode); - CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_apply, "v", num_comp_u * out_scale, bp_data.out_mode); + if (is_fine_level) { + // Create the QFunction that builds the context data + CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo); + CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE); + + // Create the operator that builds the quadrature data + CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo); + CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + + // Setup q_data + CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE); + + // Set up PDE operator + PetscBool is_interp = bp_data.in_mode == CEED_EVAL_INTERP; + CeedInt in_scale 
= bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1; + CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1; + + CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply); + if (bp_data.in_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) { + CeedQFunctionAddInput(qf_apply, "u", num_comp_u, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_apply, "du", num_comp_u * topo_dim, CEED_EVAL_GRAD); + } else { + CeedQFunctionAddInput(qf_apply, is_interp ? "u" : "du", num_comp_u * in_scale, bp_data.in_mode); + } + CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE); + if (bp_data.out_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) { + CeedQFunctionAddOutput(qf_apply, "v", num_comp_u, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_apply, "dv", num_comp_u * topo_dim, CEED_EVAL_GRAD); + } else { + CeedQFunctionAddOutput(qf_apply, is_interp ? "v" : "dv", num_comp_u * out_scale, bp_data.out_mode); + } + + // Create the mass or diff operator + CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply); + if (bp_data.in_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) { + CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + } else { + CeedOperatorSetField(op_apply, is_interp ? "u" : "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + } + CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data); + if (bp_data.out_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) { + CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + } else { + CeedOperatorSetField(op_apply, is_interp ? 
"v" : "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + } - // Create the mass or diff operator - CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply); - CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data); - CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + // Cleanup + CeedQFunctionDestroy(&qf_setup_geo); + CeedOperatorDestroy(&op_setup_geo); + } // Set up RHS if needed if (setup_rhs) { @@ -151,10 +179,7 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to CeedQFunctionDestroy(&qf_setup_rhs); CeedOperatorDestroy(&op_setup_rhs); } - // Cleanup - CeedQFunctionDestroy(&qf_setup_geo); - CeedOperatorDestroy(&op_setup_geo); CeedVectorDestroy(&x_coord); // Save libCEED data required for level diff --git a/examples/petsc/src/petscutils.c b/examples/petsc/src/petscutils.c index e8fdc4fac3..1c4076ed10 100644 --- a/examples/petsc/src/petscutils.c +++ b/examples/petsc/src/petscutils.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -425,7 +425,7 @@ PetscErrorCode CreateDistributedDM(RunParams rp, DM *dm) { } } - PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, dm)); + PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, dm)); } PetscCall(DMSetFromOptions(*dm)); diff --git a/examples/petsc/src/swarmutils.c b/examples/petsc/src/swarmutils.c index f736581ee5..21339ae9d6 100644 --- a/examples/petsc/src/swarmutils.c +++ b/examples/petsc/src/swarmutils.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -391,7 +391,7 @@ PetscErrorCode DMSwarmCreateReferenceCoordinates(DM dm_swarm, IS *is_points, Vec } // -- Cleanup - PetscCall(PetscFree(points_in_cell)); + PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points_in_cell)); } cell_points[points_offset - 1] = num_points_local + points_offset; @@ -617,6 +617,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat // Swarm objects { const PetscInt *cell_points; + CeedInt *offsets; IS is_points; Vec X_ref; CeedInt num_elem; @@ -628,7 +629,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat PetscCall(ISGetIndices(is_points, &cell_points)); PetscInt num_points = cell_points[num_elem + 1] - num_elem - 2; - CeedInt offsets[num_elem + 1 + num_points]; + PetscCall(PetscCalloc1(num_elem + 1 + num_points, &offsets)); for (PetscInt i = 0; i < num_elem + 1; i++) offsets[i] = cell_points[i + 1] - 1; for (PetscInt i = num_elem + 1; i < num_points + num_elem + 1; i++) offsets[i] = cell_points[i + 
1]; @@ -685,6 +686,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat // Cleanup PetscCall(ISDestroy(&is_points)); + PetscCall(PetscFree(offsets)); PetscCall(VecDestroy(&X_ref)); } diff --git a/examples/python/Makefile b/examples/python/Makefile new file mode 100644 index 0000000000..64244ea2c1 --- /dev/null +++ b/examples/python/Makefile @@ -0,0 +1,20 @@ +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +# All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of CEED: http://github.com/ceed + +PYTHON ?= python3 + +clean: + rm -rf build __pycache__ .pytest_cache *.so + +setup: + $(PYTHON) setup_qfunctions.py build + +TEST_OPTS ?= --ceed /cpu/self/ref/serial +test: setup + $(PYTHON) -m pytest ex_test.py $(TEST_OPTS) + +.PHONY: clean setup test diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 0000000000..ca0019ab94 --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1,25 @@ +## libCEED Python Examples + +These examples are written using libCEED's Python interface. + +### Tutorials + +These Jupyter notebooks explore the concepts of the libCEED API, including how to install the Python interface and the usage of each API object, with interactive examples. + +### Basic Examples + +The basic libCEED C examples in the folder `/examples/ceed` are also available as Python examples. + +To build the QFunctions into a shared library that the Python examples use, run + +```bash +make setup +``` + +To execute the examples, run: + +``` +python ex1_volume.py +``` + +A full list of command-line arguments is shown by adding the command-line argument "--help". 
diff --git a/examples/python/conftest.py b/examples/python/conftest.py new file mode 100644 index 0000000000..70bdf69cfc --- /dev/null +++ b/examples/python/conftest.py @@ -0,0 +1,25 @@ +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors +# All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of CEED: http://github.com/ceed + +import pytest + +# ------------------------------------------------------------------------------- +# Add --ceed command line argument +# ------------------------------------------------------------------------------- + + +def pytest_addoption(parser): + parser.addoption("--ceed", action="store", default='/cpu/self/ref/blocked') + + +@pytest.fixture(scope='session') +def ceed_resource(request): + ceed_resource = request.config.option.ceed + + return ceed_resource + +# ------------------------------------------------------------------------------- diff --git a/examples/python/ex1_volume.py b/examples/python/ex1_volume.py new file mode 100644 index 0000000000..b08b7e34e3 --- /dev/null +++ b/examples/python/ex1_volume.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +# All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+# +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of CEED: http://github.com/ceed +# +# libCEED example using mass operator to compute volume +# +# Sample runs: +# +# python ex1_volume.py +# python ex1_volume.py -c /cpu/self +# python ex1_volume.py -c /gpu/cuda + +import sys +import os +import numpy as np +import libceed +import ex_common as common + + +def main(): + """Main function for volume example""" + args = common.parse_arguments() + return example_1(args) + + +def example_1(args): + """Compute volume using mass operator + + Args: + args: Parsed command line arguments + + Returns: + int: 0 on success, error code on failure + """ + # Process arguments + dim = args.dim + mesh_degree = max(args.mesh_degree, args.solution_degree) + sol_degree = args.solution_degree + num_qpts = args.quadrature_points + problem_size = args.problem_size if args.problem_size > 0 else (8 * 16 if args.test else 256 * 1024) + ncomp_x = dim # Number of coordinate components + + # Print configuration + if not args.quiet: + print("Selected options: [command line option] : ") + print(f" Ceed specification [-c] : {args.ceed}") + print(f" Mesh dimension [-d] : {dim}") + print(f" Mesh degree [-m] : {mesh_degree}") + print(f" Solution degree [-p] : {sol_degree}") + print(f" Num. 1D quadr. pts [-q] : {num_qpts}") + print(f" Approx. 
# unknowns [-s] : {problem_size}") + print(f" QFunction source [-g] : {'gallery' if args.gallery else 'user'}") + + # Initialize CEED + ceed = libceed.Ceed(args.ceed) + + # Create bases + # Tensor-product Lagrange basis for mesh coordinates + mesh_basis = ceed.BasisTensorH1Lagrange( + dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS) + + # Tensor-product Lagrange basis for solution + solution_basis = ceed.BasisTensorH1Lagrange( + dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS) + + # Create mesh + # Determine mesh size + num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size) + if not args.quiet: + print("\nMesh size : nx = %d" % num_xyz[0], end="") + if dim > 1: + print(", ny = %d" % num_xyz[1], end="") + if dim > 2: + print(", nz = %d" % num_xyz[2], end="") + print() + + # Create element restrictions + num_q_comp = 1 + mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction( + ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False) + solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction( + ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True) + + if not args.quiet: + print("Number of mesh nodes : %d" % (mesh_size // dim)) + print("Number of solution nodes : %d" % sol_size) + + # Create and transform mesh coordinates + mesh_coords = ceed.Vector(mesh_size) + common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords) + exact_volume, _ = common.transform_mesh_coords(dim, mesh_size, mesh_coords) + + # Create the QFunction that builds the mass operator (i.e. 
computes its quadrature data) and set its context data + qf_build = None + if args.gallery: + qf_build = ceed.QFunctionByName(f"Mass{dim}DBuild") + else: + build_ctx = ceed.QFunctionContext() + ctx_data = np.array([dim, dim], dtype=np.int32) + build_ctx.set_data(ctx_data) + + qfs_so = common.load_qfs_so() + file_dir = os.path.dirname(os.path.abspath(__file__)) + + qf_build = ceed.QFunction(1, qfs_so.build_mass, + os.path.join(file_dir, "ex1-volume.h:build_mass")) + qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD) + qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT) + qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE) + qf_build.set_context(build_ctx) + + # Create the operator that builds the quadrature data for the mass operator + op_build = ceed.Operator(qf_build) + op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE) + op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE) + op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE) + + # Compute the quadrature data for the mass operator + q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp) + op_build.apply(mesh_coords, q_data) + + # Setup QFunction for applying the mass operator + qf_mass = None + if args.gallery: + qf_mass = ceed.QFunctionByName("MassApply") + else: + build_ctx = ceed.QFunctionContext() + ctx_data = np.array([dim, dim], dtype=np.int32) + build_ctx.set_data(ctx_data) + + qfs_so = common.load_qfs_so() + file_dir = os.path.dirname(os.path.abspath(__file__)) + + qf_mass = ceed.QFunction(1, qfs_so.apply_mass, + os.path.join(file_dir, "ex1-volume.h:apply_mass")) + qf_mass.add_input("u", 1, libceed.EVAL_INTERP) + qf_mass.add_input("qdata", num_q_comp, libceed.EVAL_NONE) + qf_mass.add_output("v", 1, libceed.EVAL_INTERP) + qf_mass.set_context(build_ctx) + + # Create the mass operator + op_mass = ceed.Operator(qf_mass) + op_mass.set_field("u", solution_restriction, solution_basis, 
libceed.VECTOR_ACTIVE) + op_mass.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data) + op_mass.set_field("v", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE) + + # Create solution vectors + u = ceed.Vector(sol_size) + v = ceed.Vector(sol_size) + u.set_value(1.0) # Set all entries of u to 1.0 + + # Apply mass operator: v = M * u + op_mass.apply(u, v) + + # Compute volume by summing all entries in v + volume = 0.0 + with v.array_read() as v_array: + # Simply sum all values to compute the volume + volume = np.sum(v_array) + + if not args.test: + print() + print(f"Exact mesh volume : {exact_volume:.14g}") + print(f"Computed mesh volume : {volume:.14g}") + print(f"Volume error : {volume - exact_volume:.14g}") + else: + # Test mode - check if error is within tolerance + tol = 200 * libceed.EPSILON if dim == 1 else 1e-5 + if abs(volume - exact_volume) > tol: + print(f"Volume error : {volume - exact_volume:.14g}") + sys.exit(1) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/python/ex2_surface.py b/examples/python/ex2_surface.py new file mode 100644 index 0000000000..f741600110 --- /dev/null +++ b/examples/python/ex2_surface.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +# All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
#
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of CEED: http://github.com/ceed
#
# libCEED example using diffusion operator to compute surface area
#
# Sample runs:
#
# python ex2_surface.py
# python ex2_surface.py -c /cpu/self
# python ex2_surface.py -c /gpu/cuda

import sys
import os
import numpy as np
import libceed
import ex_common as common


def main():
    """Command line entry point: parse the options and run example 2.

    Returns:
        int: The value returned by example_2
    """
    args = common.parse_arguments()
    return example_2(args)


def example_2(options):
    """Compute the mesh surface area by applying a diffusion operator.

    Args:
        options: Parsed command line arguments; the fields read here are
            ceed, dim, mesh_degree, solution_degree, quadrature_points,
            problem_size, test, quiet and gallery

    Returns:
        int: 0 on success. In test mode the process exits with status 1
            (via sys.exit) when the error exceeds the tolerance.
    """
    # Process arguments
    args = options
    dim = args.dim
    # The mesh degree is never lower than the solution degree
    mesh_degree = max(args.mesh_degree, args.solution_degree)
    sol_degree = args.solution_degree
    num_qpts = args.quadrature_points
    # A non-positive problem size selects a default that depends on test mode
    problem_size = args.problem_size if args.problem_size > 0 else (500 * dim * dim if args.test else 256 * 1024)
    ncomp_x = dim  # Number of coordinate components

    # Print configuration
    if not args.quiet:
        print("Selected options: [command line option] : ")
        print(f" Ceed specification [-c] : {args.ceed}")
        print(f" Mesh dimension [-d] : {dim}")
        print(f" Mesh degree [-m] : {mesh_degree}")
        print(f" Solution degree [-p] : {sol_degree}")
        print(f" Num. 1D quadr. pts [-q] : {num_qpts}")
        print(f" Approx. # unknowns [-s] : {problem_size}")
        print(f" QFunction source [-g] : {'gallery' if args.gallery else 'user'}")

    # Initialize CEED
    ceed = libceed.Ceed(args.ceed)

    # Create bases
    # Tensor-product Lagrange basis for mesh coordinates
    mesh_basis = ceed.BasisTensorH1Lagrange(
        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)

    # Tensor-product Lagrange basis for solution
    solution_basis = ceed.BasisTensorH1Lagrange(
        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)

    # Create mesh
    # Determine mesh size
    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
    if not args.quiet:
        print("\nMesh size : nx = %d" % num_xyz[0], end="")
        if dim > 1:
            print(", ny = %d" % num_xyz[1], end="")
        if dim > 2:
            print(", nz = %d" % num_xyz[2], end="")
        print()

    # Create element restrictions
    # One quadrature data component per entry of a symmetric dim x dim matrix
    num_q_comp = dim * (dim + 1) // 2
    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)

    if not args.quiet:
        print("Number of mesh nodes : %d" % (mesh_size // dim))
        print("Number of solution nodes : %d" % sol_size)

    # Create and transform mesh coordinates
    mesh_coords = ceed.Vector(mesh_size)
    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
    # Only the exact surface area is needed here; the exact volume is discarded
    _, exact_surface_area = common.transform_mesh_coords(dim, mesh_size, mesh_coords, use_sin=False)

    # Create the QFunction that builds the diffusion operator (i.e. computes
    # its quadrature data) and set its context data
    qf_build = None
    if args.gallery:
        qf_build = ceed.QFunctionByName(f"Poisson{dim}DBuild")
    else:
        # Context holds two int32 values, both set to dim
        build_ctx = ceed.QFunctionContext()
        ctx_data = np.array([dim, dim], dtype=np.int32)
        build_ctx.set_data(ctx_data)

        qfs_so = common.load_qfs_so()
        file_dir = os.path.dirname(os.path.abspath(__file__))

        # NOTE(review): file_dir is this script's directory, while the header
        # is added under a qfunctions/ subdirectory - confirm that the
        # "<path>:<name>" source location resolves for backends that need it.
        qf_build = ceed.QFunction(1, qfs_so.build_diff,
                                  os.path.join(file_dir, "ex2-surface.h:build_diff"))
        # The order of the add_input/add_output calls must stay as is
        qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
        qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
        qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
        qf_build.set_context(build_ctx)

    # Operator for building quadrature data
    op_build = ceed.Operator(qf_build)
    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)

    # Compute quadrature data
    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
    op_build.apply(mesh_coords, q_data)

    # Create the QFunction that defines the action of the diffusion operator
    qf_diff = None
    if args.gallery:
        qf_diff = ceed.QFunctionByName(f"Poisson{dim}DApply")
    else:
        # A fresh context with the same contents as the one used for the build
        build_ctx = ceed.QFunctionContext()
        ctx_data = np.array([dim, dim], dtype=np.int32)
        build_ctx.set_data(ctx_data)

        qfs_so = common.load_qfs_so()
        file_dir = os.path.dirname(os.path.abspath(__file__))

        qf_diff = ceed.QFunction(1, qfs_so.apply_diff,
                                 os.path.join(file_dir, "ex2-surface.h:apply_diff"))
        qf_diff.add_input("du", dim, libceed.EVAL_GRAD)
        qf_diff.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
        qf_diff.add_output("dv", dim, libceed.EVAL_GRAD)
        qf_diff.set_context(build_ctx)

    # Diffusion operator
    op_diff = ceed.Operator(qf_diff)
    op_diff.set_field("du", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
    # The quadrature data computed above is attached as a passive field
    op_diff.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
    op_diff.set_field("dv", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)

    # Create vectors
    u = ceed.Vector(sol_size)  # Input vector
    v = ceed.Vector(sol_size)  # Output vector

    # Initialize u with sum of coordinates (x + y + z)
    # NOTE(review): mesh_coords is indexed with sol_size as the component
    # stride; that only lines up with the mesh layout when the mesh and the
    # solution have the same number of nodes (mesh_degree == sol_degree) -
    # confirm this is intended.
    with mesh_coords.array_read() as x_array, u.array_write() as u_array:
        for i in range(sol_size):
            u_array[i] = sum(x_array[i + j * (sol_size)] for j in range(dim))

    # Apply operator: v = K * u
    op_diff.apply(u, v)

    # Compute surface area by summing absolute values of v
    surface_area = 0.0
    with v.array_read() as v_array:
        surface_area = np.sum(abs(v_array))

    if not args.test:
        print()
        print(f"Exact mesh surface area : {exact_surface_area:.14g}")
        print(f"Computed mesh surface area : {surface_area:.14g}")
        print(f"Surface area error : {surface_area - exact_surface_area:.14g}")
    else:
        # Test mode - check if error is within tolerance
        tol = 10000 * libceed.EPSILON if dim == 1 else 1e-1
        if abs(surface_area - exact_surface_area) > tol:
            print(f"Surface area error : {surface_area - exact_surface_area:.14g}")
            sys.exit(1)

    return 0


if __name__ == "__main__":
    sys.exit(main())
#
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of CEED: http://github.com/ceed
#
# libCEED example using mass and diffusion operators to compute volume
#
# Sample runs:
#
# python ex3_volume.py
# python ex3_volume.py -c /cpu/self
# python ex3_volume.py -c /gpu/cuda

import sys
import os
import numpy as np
import libceed
import ex_common as common


def main():
    """Command line entry point: parse the options and run example 3.

    Returns:
        int: The value returned by example_3, so that it can be used as the
            process exit status
    """
    args = common.parse_arguments()
    return example_3(args)


def example_3(args):
    """Compute the mesh volume by applying a combined mass + diffusion operator.

    The operator is applied to a vector of ones and the entries of the
    result are summed.

    Args:
        args: Parsed command line arguments; the fields read here are
            ceed, dim, mesh_degree, solution_degree, quadrature_points,
            problem_size, test, quiet and gallery

    Returns:
        int: 0 on success. The process exits with status 1 (via sys.exit)
            when gallery QFunctions are requested or, in test mode, when the
            error exceeds the tolerance.
    """
    # Process arguments
    dim = args.dim
    # The mesh degree is never lower than the solution degree
    mesh_degree = max(args.mesh_degree, args.solution_degree)
    sol_degree = args.solution_degree
    num_qpts = args.quadrature_points
    # A non-positive problem size selects a default that depends on test mode
    problem_size = args.problem_size if args.problem_size > 0 else (8 * 16 if args.test else 256 * 1024)
    ncomp_x = dim  # Number of coordinate components

    # Print configuration
    if not args.quiet:
        print("Selected options: [command line option] : ")
        print(f" Ceed specification [-c] : {args.ceed}")
        print(f" Mesh dimension [-d] : {dim}")
        print(f" Mesh degree [-m] : {mesh_degree}")
        print(f" Solution degree [-p] : {sol_degree}")
        print(f" Num. 1D quadr. pts [-q] : {num_qpts}")
        print(f" Approx. # unknowns [-s] : {problem_size}")
        print(f" QFunction source [-g] : {'gallery' if args.gallery else 'user'}")

    # Check - Gallery not supported
    if args.gallery:
        print("Gallery QFunction not supported for example 3")
        sys.exit(1)

    # Initialize CEED
    ceed = libceed.Ceed(args.ceed)

    # Create bases
    # Tensor-product Lagrange basis for mesh coordinates
    mesh_basis = ceed.BasisTensorH1Lagrange(
        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)

    # Tensor-product Lagrange basis for solution
    solution_basis = ceed.BasisTensorH1Lagrange(
        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)

    # Create mesh
    # Determine mesh size
    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
    if not args.quiet:
        print("\nMesh size : nx = %d" % num_xyz[0], end="")
        if dim > 1:
            print(", ny = %d" % num_xyz[1], end="")
        if dim > 2:
            print(", nz = %d" % num_xyz[2], end="")
        print()

    # Create element restrictions
    # One component for the mass term plus the dim * (dim + 1) / 2 entries of
    # a symmetric dim x dim matrix for the diffusion term
    num_q_comp = 1 + dim * (dim + 1) // 2
    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)

    if not args.quiet:
        print("Number of mesh nodes : %d" % (mesh_size // dim))
        print("Number of solution nodes : %d" % sol_size)

    # Create and transform mesh coordinates
    mesh_coords = ceed.Vector(mesh_size)
    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
    # Only the exact volume is needed here; the exact surface area is discarded
    exact_volume, _ = common.transform_mesh_coords(dim, mesh_size, mesh_coords)

    # Create QFunction context (two int32 values, both set to dim)
    build_ctx = ceed.QFunctionContext()
    ctx_data = np.array([dim, dim], dtype=np.int32)
    build_ctx.set_data(ctx_data)

    # Load QFunctions
    qfs_so = common.load_qfs_so()
    file_dir = os.path.dirname(os.path.abspath(__file__))

    # Create the QFunction that builds the mass + diffusion operator (i.e.
    # computes its quadrature data) and set its context data
    # NOTE(review): file_dir is this script's directory - confirm that
    # "ex3-volume.h" is reachable there for backends that need the source.
    qf_build = ceed.QFunction(1, qfs_so.build_mass_diff,
                              os.path.join(file_dir, "ex3-volume.h:build_mass_diff"))
    # The order of the add_input/add_output calls must stay as is
    qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
    qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
    qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
    qf_build.set_context(build_ctx)

    # Create the operator that builds the quadrature data for the mass + diffusion operator
    op_build = ceed.Operator(qf_build)
    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)

    # Compute the quadrature data for the mass + diffusion operator
    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
    op_build.apply(mesh_coords, q_data)

    # Create the QFunction that defines the action of the mass + diffusion operator
    qf_apply = ceed.QFunction(1, qfs_so.apply_mass_diff,
                              os.path.join(file_dir, "ex3-volume.h:apply_mass_diff"))
    qf_apply.add_input("u", 1, libceed.EVAL_INTERP)
    qf_apply.add_input("du", dim, libceed.EVAL_GRAD)
    qf_apply.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
    qf_apply.add_output("v", 1, libceed.EVAL_INTERP)
    qf_apply.add_output("dv", dim, libceed.EVAL_GRAD)
    qf_apply.set_context(build_ctx)

    # Create the mass + diffusion operator
    op_apply = ceed.Operator(qf_apply)
    op_apply.set_field("u", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
    op_apply.set_field("du", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
    # The quadrature data computed above is attached as a passive field
    op_apply.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
    op_apply.set_field("v", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
    op_apply.set_field("dv", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)

    # Create solution vectors
    u = ceed.Vector(sol_size)
    v = ceed.Vector(sol_size)
    u.set_value(1.0)  # Set all entries of u to 1.0

    # Apply mass + diffusion operator: v = (M + K) * u
    op_apply.apply(u, v)

    # Compute volume by summing all entries in v
    volume = 0.0
    with v.array_read() as v_array:
        # Simply sum all values to compute the volume
        volume = np.sum(v_array)

    if not args.test:
        print()
        print(f"Exact mesh volume : {exact_volume:.14g}")
        print(f"Computed mesh volume : {volume:.14g}")
        print(f"Volume error : {volume - exact_volume:.14g}")
    else:
        # Test mode - check if error is within tolerance
        tol = 200 * libceed.EPSILON if dim == 1 else 1e-5
        if abs(volume - exact_volume) > tol:
            print(f"Volume error : {volume - exact_volume:.14g}")
            sys.exit(1)

    return 0


if __name__ == "__main__":
    # Propagate the example's return value as the exit status, like the other examples
    sys.exit(main())
#
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of CEED: http://github.com/ceed

import sys
import os
from sysconfig import get_config_var
import argparse
import math
import numpy as np
import libceed
import ctypes


def parse_arguments():
    """Parse the command line arguments shared by the examples

    Returns:
        Namespace: Parsed arguments with fields:
            ceed: CEED resource specifier
            dim: Problem dimension (1-3)
            mesh_degree: Mesh polynomial degree
            solution_degree: Solution polynomial degree
            quadrature_points: Number of quadrature points
            problem_size: Approximate problem size (-1 when not given)
            test: Test mode flag
            quiet: Suppress output flag
            gallery: Use gallery QFunctions flag

    The parser reports an error (and exits) when dim is not 1, 2 or 3.
    """
    parser = argparse.ArgumentParser(description="libCEED surface area example")
    parser.add_argument("-c", "--ceed", default="/cpu/self",
                        help="libCEED resource specifier (default: /cpu/self)")
    parser.add_argument("-d", "--dim", type=int, default=3,
                        help="Problem dimension (1-3) (default: 3)")
    parser.add_argument("-m", "--mesh-degree", type=int, default=4,
                        help="Mesh polynomial degree (default: 4)")
    parser.add_argument("-p", "--solution-degree", type=int, default=4,
                        help="Solution polynomial degree (default: 4)")
    parser.add_argument("-q", "--quadrature-points", type=int, default=6,
                        help="Number of quadrature points (default: 6)")
    parser.add_argument("-s", "--problem-size", type=int, default=-1,
                        help="Approximate problem size (default: ~256k)")
    parser.add_argument("-t", "--test", action="store_true",
                        help="Test mode (reduced problem size)")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress output")
    parser.add_argument("-g", "--gallery", action="store_true",
                        help="Use gallery QFunctions")

    args = parser.parse_args()
    if args.dim not in [1, 2, 3]:
        parser.error("Dimension must be 1, 2, or 3")
    return args


def get_cartesian_mesh_size(dim, degree, prob_size):
    """Determine Cartesian mesh size for given problem size

    Args:
        dim: Spatial dimension (1-3)
        degree: Polynomial degree
        prob_size: Target problem size

    Returns:
        list: Number of elements in each dimension (each a power of two)
    """
    # Calculate number of elements needed
    num_elem = prob_size // (degree ** dim)

    # Find smallest power of 2 >= num_elem
    # (true division: s counts the halvings needed to bring num_elem to <= 1)
    s = 0
    while num_elem > 1:
        num_elem = num_elem / 2
        s += 1

    # Distribute across dimensions: every dimension gets s // dim doublings
    # and the first s % dim dimensions get one more
    r = s % dim
    num_xyz = []
    for d in range(dim):
        sd = s // dim
        if r > 0:
            sd += 1
            r -= 1
        num_xyz.append(1 << sd)
    return num_xyz


def build_cartesian_restriction(ceed, dim, num_xyz, degree, num_comp, num_q_comp, num_qpts, create_qdata=False):
    """Build element restriction for Cartesian grid

    Args:
        ceed: libCEED context
        dim: Spatial dimension
        num_xyz: Elements per dimension
        degree: Polynomial degree
        num_comp: Number of components
        num_q_comp: Number of quadrature data components
        num_qpts: Quadrature points per dimension
        create_qdata: Flag to also build the strided restriction for
            quadrature data

    Returns:
        tuple: (elem_restriction, size, q_data_restriction, num_elem, elem_qpts)
            where q_data_restriction is None unless create_qdata is set
    """
    p = degree + 1  # Nodes per element per dimension
    num_nodes = p ** dim
    elem_qpts = num_qpts ** dim

    # Calculate grid parameters
    nd = []
    num_elem = 1
    scalar_size = 1
    for d in range(dim):
        num_elem *= num_xyz[d]
        nd.append(num_xyz[d] * (p - 1) + 1)  # Nodes per dimension
        scalar_size *= nd[d]

    size = scalar_size * num_comp

    # Create element connectivity
    elem_nodes = np.zeros(num_elem * num_nodes, dtype=np.int32)
    for e in range(num_elem):
        # Get element coordinates (first dimension varies fastest)
        e_xyz = [0] * dim
        re = e
        for d in range(dim):
            e_xyz[d] = re % num_xyz[d]
            re //= num_xyz[d]

        # Calculate global node numbers
        for n in range(num_nodes):
            g_node = 0
            g_stride = 1
            r_node = n
            for d in range(dim):
                g_node += (e_xyz[d] * (p - 1) + r_node % p) * g_stride
                g_stride *= nd[d]
                r_node //= p
            elem_nodes[e * num_nodes + n] = g_node

    # Create restrictions
    elem_restriction = ceed.ElemRestriction(
        num_elem, num_nodes, num_comp, scalar_size, size, elem_nodes)

    q_data_restriction = None
    if create_qdata:
        strides = np.array([1, elem_qpts, elem_qpts * num_q_comp], dtype=np.int32)
        q_data_restriction = ceed.StridedElemRestriction(
            num_elem, elem_qpts, num_q_comp, num_elem * elem_qpts * num_q_comp, strides)

    return elem_restriction, size, q_data_restriction, num_elem, elem_qpts


def set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords):
    """Create Cartesian mesh coordinates

    Args:
        ceed: libCEED context
        dim: Spatial dimension
        num_xyz: Elements per dimension
        mesh_degree: Mesh polynomial degree
        mesh_coords: CeedVector to hold mesh coordinates

    Returns:
        Vector: The mesh_coords vector that was passed in, now filled
    """
    p = mesh_degree + 1
    nd = []
    scalar_size = 1
    for d in range(dim):
        nd.append(num_xyz[d] * (p - 1) + 1)
        scalar_size *= nd[d]

    # Get Lobatto nodes (quadrature points)
    nodes, _ = ceed.lobatto_quadrature(p)
    nodes = 0.5 + 0.5 * nodes  # Map from [-1,1] to [0,1]

    # Create coordinates; component d of node gs_node is stored at
    # gs_node + scalar_size * d
    coords = np.zeros(scalar_size * dim)
    for gs_node in range(scalar_size):
        r_node = gs_node
        for d in range(dim):
            d_1d = r_node % nd[d]
            elem_id = d_1d // (p - 1)
            node_id = d_1d % (p - 1)
            coords[gs_node + scalar_size * d] = (elem_id + nodes[node_id]) / num_xyz[d]
            r_node //= nd[d]

    mesh_coords.set_array(coords, cmode=libceed.COPY_VALUES)
    return mesh_coords


def transform_mesh_coords(dim, mesh_size, mesh_coords, use_sin=True):
    """Transform mesh coordinates in place and return the exact volume and surface area

    Args:
        dim: Spatial dimension
        mesh_size: Total mesh vector size
        mesh_coords: Mesh coordinates vector
        use_sin: Only used when dim > 1. When set, the first two coordinate
            components are mapped with u = 1 + x, v = pi / 2 * y to
            (u cos(v), u sin(v)); otherwise only the first component is
            mapped, with the same sine-based formula that is always used
            for dim == 1.

    Returns:
        tuple: (exact_volume, exact_area) looked up by dim; both values are
            returned regardless of use_sin
    """
    exact_volume = {1: 1.0, 2: 3. / 4. * np.pi, 3: 3. / 4. * np.pi}[dim]
    exact_area = {1: 2.0, 2: 4.0, 3: 6.0}[dim]

    # Apply transformation to coordinates
    num_nodes = mesh_size // dim
    # The existing coordinates are read before being overwritten, so
    # read-write access is required; write-only access does not guarantee
    # that the current values are visible.
    with mesh_coords.array() as coords:
        if dim == 1:
            for i in range(num_nodes):
                x = coords[i] - 0.5
                coords[i] = 0.5 + (1.0 / np.sqrt(3.0)) * np.sin((2.0 / 3.0) * np.pi * x)
        else:
            if use_sin:
                for i in range(num_nodes):
                    u = 1. + coords[i]
                    v = np.pi / 2. * coords[i + num_nodes]
                    coords[i] = u * np.cos(v)
                    coords[i + num_nodes] = u * np.sin(v)
            else:
                for i in range(num_nodes):
                    x = coords[i] - 0.5
                    coords[i] = 0.5 + (1.0 / np.sqrt(3.0)) * np.sin((2.0 / 3.0) * np.pi * x)

    return (exact_volume, exact_area)


def find_qfs_so(name, path):
    """Find the QFunctions shared library.

    Args:
        name: File name to look for
        path: Directory that is searched recursively

    Returns:
        Filepath to shared library object, or None when no file called
        name exists under path
    """
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)
    return None


def load_qfs_so():
    """Load the QFunctions shared library.

    The library is searched for under the "build" directory next to this
    file.

    Returns:
        Loaded shared library object

    Raises:
        FileNotFoundError: If the shared library cannot be found
    """
    file_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "build")
    lib_name = "libceed_c_qfunctions" + get_config_var("EXT_SUFFIX")
    qfs_so = find_qfs_so(lib_name, file_dir)

    # Fail early with a clear message instead of handing None to ctypes
    if qfs_so is None:
        raise FileNotFoundError(
            f"QFunctions shared library '{lib_name}' not found under '{file_dir}'")

    # Load library
    return ctypes.cdll.LoadLibrary(qfs_so)
#
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of CEED: http://github.com/ceed

import pytest
from argparse import Namespace
import ex1_volume
import ex2_surface
import ex3_volume

# -------------------------------------------------------------------------------


def _example_args(ceed_resource, dim, gallery=False):
    """Build the argument namespace shared by every example test.

    Args:
        ceed_resource: CEED resource specifier passed on to the example
        dim: Problem dimension
        gallery: Whether the example should use gallery QFunctions

    Returns:
        Namespace: Arguments with degree 4 mesh and solution, 6 quadrature
            points, default problem size, and test and quiet mode enabled
    """
    return Namespace(
        ceed=ceed_resource,
        dim=dim,
        mesh_degree=4,
        solution_degree=4,
        quadrature_points=6,
        problem_size=-1,
        test=True,
        quiet=True,
        gallery=gallery,
    )

# -------------------------------------------------------------------------------
# Example 1: user (test_10x) and gallery (test_10xg) QFunctions in x dimensions
# -------------------------------------------------------------------------------


def test_101(ceed_resource):
    ex1_volume.example_1(_example_args(ceed_resource, dim=1))

# -------------------------------------------------------------------------------


def test_101g(ceed_resource):
    ex1_volume.example_1(_example_args(ceed_resource, dim=1, gallery=True))

# -------------------------------------------------------------------------------


def test_102(ceed_resource):
    ex1_volume.example_1(_example_args(ceed_resource, dim=2))

# -------------------------------------------------------------------------------


def test_102g(ceed_resource):
    ex1_volume.example_1(_example_args(ceed_resource, dim=2, gallery=True))

# -------------------------------------------------------------------------------


def test_103(ceed_resource):
    ex1_volume.example_1(_example_args(ceed_resource, dim=3))

# -------------------------------------------------------------------------------


def test_103g(ceed_resource):
    ex1_volume.example_1(_example_args(ceed_resource, dim=3, gallery=True))

# -------------------------------------------------------------------------------
# Example 2: user (test_20x) and gallery (test_20xg) QFunctions in x dimensions
# -------------------------------------------------------------------------------


def test_201(ceed_resource):
    ex2_surface.example_2(_example_args(ceed_resource, dim=1))

# -------------------------------------------------------------------------------


def test_201g(ceed_resource):
    ex2_surface.example_2(_example_args(ceed_resource, dim=1, gallery=True))

# -------------------------------------------------------------------------------


def test_202(ceed_resource):
    ex2_surface.example_2(_example_args(ceed_resource, dim=2))

# -------------------------------------------------------------------------------


def test_202g(ceed_resource):
    ex2_surface.example_2(_example_args(ceed_resource, dim=2, gallery=True))

# -------------------------------------------------------------------------------


def test_203(ceed_resource):
    ex2_surface.example_2(_example_args(ceed_resource, dim=3))

# -------------------------------------------------------------------------------


def test_203g(ceed_resource):
    ex2_surface.example_2(_example_args(ceed_resource, dim=3, gallery=True))

# -------------------------------------------------------------------------------
# Example 3: user QFunctions only (test_30x) in x dimensions
# -------------------------------------------------------------------------------


def test_301(ceed_resource):
    ex3_volume.example_3(_example_args(ceed_resource, dim=1))

# -------------------------------------------------------------------------------


def test_302(ceed_resource):
    ex3_volume.example_3(_example_args(ceed_resource, dim=2))

# -------------------------------------------------------------------------------


def test_303(ceed_resource):
    ex3_volume.example_3(_example_args(ceed_resource, dim=3))

# -------------------------------------------------------------------------------
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed +#pragma once -#ifndef CEED_OCCA_KERNELS_KERNELDEFINES_HEADER -#define CEED_OCCA_KERNELS_KERNELDEFINES_HEADER +#include -#define STRINGIFY_SOURCE(...) #__VA_ARGS__ - -#endif +/// A structure used to pass additional data +struct BuildContext { + CeedInt dim, space_dim; +}; diff --git a/examples/python/qfunctions/ex1-volume.h b/examples/python/qfunctions/ex1-volume.h new file mode 100644 index 0000000000..907b77bf5c --- /dev/null +++ b/examples/python/qfunctions/ex1-volume.h @@ -0,0 +1,60 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#pragma once + +#include +#include "ex-common.h" + +/// libCEED Q-function for building quadrature data for a mass operator +CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0] is Jacobians with shape [dim, dim, Q] + // in[1] is quadrature weights with shape [1, Q] + const CeedScalar *w = in[1]; + CeedScalar *q_data = out[0]; + struct BuildContext *build_data = (struct BuildContext *)ctx; + + switch (build_data->dim + 10 * build_data->space_dim) { + case 11: { + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; } // End of Quadrature Point Loop + } break; + case 22: { + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i]; + } // End of Quadrature Point Loop + } break; + case 33: { + const 
CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + q_data[i] = + (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) + + J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) * + w[i]; + } // End of Quadrature Point Loop + } break; + } + return CEED_ERROR_SUCCESS; +} + +/// libCEED Q-function for applying a mass operator +CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0], out[0] are solution variables with shape [1, Q] + // in[1] is quadrature data with shape [1, Q] + const CeedScalar *u = in[0], *q_data = in[1]; + CeedScalar *v = out[0]; + + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; } // End of Quadrature Point Loop + return CEED_ERROR_SUCCESS; +} diff --git a/examples/python/qfunctions/ex2-surface.h b/examples/python/qfunctions/ex2-surface.h new file mode 100644 index 0000000000..980a952105 --- /dev/null +++ b/examples/python/qfunctions/ex2-surface.h @@ -0,0 +1,135 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#pragma once + +#include +#include "ex-common.h" + +/// libCEED Q-function for building quadrature data for a diffusion operator +CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0] is Jacobians with shape [dim, dim, Q] + // in[1] is quadrature weights, size (Q) + const CeedScalar *w = in[1]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + struct BuildContext *build_data = (struct BuildContext *)ctx; + + // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store + // the symmetric part of the result. + switch (build_data->dim + 10 * build_data->space_dim) { + case 11: { + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[0][i] = w[i] / J[0][0][i]; } // End of Quadrature Point Loop + } break; + case 22: { + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // J: 0 2 q_data: 0 2 adj(J): J11 -J01 + // 1 3 2 1 -J10 J00 + const CeedScalar J00 = J[0][0][i]; + const CeedScalar J10 = J[0][1][i]; + const CeedScalar J01 = J[1][0][i]; + const CeedScalar J11 = J[1][1][i]; + const CeedScalar qw = w[i] / (J00 * J11 - J10 * J01); + + q_data[0][i] = qw * (J01 * J01 + J11 * J11); + q_data[1][i] = qw * (J00 * J00 + J10 * J10); + q_data[2][i] = -qw * (J00 * J01 + J10 * J11); + } // End of Quadrature Point Loop + } break; + case 33: { + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Compute the adjoint + CeedScalar A[3][3]; + + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 3; k++) { + // Equivalent code with J as a VLA and no mod operations: + // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - 
J[j+1][k+2]*J[j+2][k+1] + A[k][j] = + J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i]; + } + } + + // Compute quadrature weight / det(J) + const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]); + + // Compute geometric factors + // Stored in Voigt convention + // 0 5 4 + // 5 1 3 + // 4 3 2 + q_data[0][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]); + q_data[1][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]); + q_data[2][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]); + q_data[3][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]); + q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); + q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); + } // End of Quadrature Point Loop + } break; + } + return CEED_ERROR_SUCCESS; +} + +/// libCEED Q-function for applying a diff operator +CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + struct BuildContext *build_data = (struct BuildContext *)ctx; + // in[0], out[0] solution gradients with shape [dim, 1, Q] + // in[1] is quadrature data with shape [num_components, Q] + const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + + switch (build_data->dim) { + case 1: { + const CeedScalar *ug = in[0]; + CeedScalar *vg = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[0][i]; } // End of Quadrature Point Loop + } break; + case 2: { + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Read q_data (dXdxdXdx_T symmetric matrix) + // Stored in Voigt convention + // 0 2 + // 2 1 + const CeedScalar 
dXdxdXdx_T[2][2] = { + {q_data[0][i], q_data[2][i]}, + {q_data[2][i], q_data[1][i]} + }; + + // j = direction of vg + for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]); + } // End of Quadrature Point Loop + } break; + case 3: { + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Read q_data (dXdxdXdx_T symmetric matrix) + // Stored in Voigt convention + // 0 5 4 + // 5 1 3 + // 4 3 2 + const CeedScalar dXdxdXdx_T[3][3] = { + {q_data[0][i], q_data[5][i], q_data[4][i]}, + {q_data[5][i], q_data[1][i], q_data[3][i]}, + {q_data[4][i], q_data[3][i], q_data[2][i]} + }; + + // j = direction of vg + for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]); + } // End of Quadrature Point Loop + } break; + } + return CEED_ERROR_SUCCESS; +} diff --git a/examples/python/qfunctions/ex3-volume.h b/examples/python/qfunctions/ex3-volume.h new file mode 100644 index 0000000000..1a992480cc --- /dev/null +++ b/examples/python/qfunctions/ex3-volume.h @@ -0,0 +1,168 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#pragma once + +#include +#include "ex-common.h" + +/// libCEED Q-function for building quadrature data for a mass + diffusion operator +CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0] is Jacobians with shape [dim, dim, Q] + // in[1] is quadrature weights, size (Q) + const CeedScalar *w = in[1]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + struct BuildContext *build_data = (struct BuildContext *)ctx; + + // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store + // the symmetric part of the result. + switch (build_data->dim + 10 * build_data->space_dim) { + case 11: { // dim = 1, space_dim = 1 + const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + q_data[0][i] = w[i] * J[0][0][i]; + + // Diffusion + q_data[1][i] = w[i] / J[0][0][i]; + } + } break; + case 22: { // dim = 2, space_dim = 2 + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // J: 0 2 q_data: 0 2 adj(J): J22 -J12 + // 1 3 2 1 -J10 J00 + const CeedScalar J00 = J[0][0][i]; + const CeedScalar J10 = J[0][1][i]; + const CeedScalar J01 = J[1][0][i]; + const CeedScalar J11 = J[1][1][i]; + const CeedScalar qw = w[i] / (J00 * J11 - J10 * J01); + + // Mass + q_data[0][i] = w[i] * (J00 * J11 - J10 * J01); + + // Diffusion + q_data[1][i] = qw * (J01 * J01 + J11 * J11); + q_data[2][i] = qw * (J00 * J00 + J10 * J10); + q_data[3][i] = -qw * (J00 * J01 + J10 * J11); + } + } break; + case 33: { // dim = 3, space_dim = 3 + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Compute the adjoint + CeedScalar A[3][3]; + for (CeedInt j = 0; 
j < 3; j++) { + for (CeedInt k = 0; k < 3; k++) { + A[k][j] = + J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i]; + } + } + + // Compute quadrature weight / det(J) + const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]); + + // Mass + q_data[0][i] = w[i] * (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]); + + // Diffusion + // Stored in Voigt convention + // 1 6 5 + // 6 2 4 + // 5 4 3 + q_data[1][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]); + q_data[2][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]); + q_data[3][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]); + q_data[4][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]); + q_data[5][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); + q_data[6][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); + } + } break; + } + return CEED_ERROR_SUCCESS; +} + +/// libCEED Q-function for applying a mass + diffusion operator +CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + struct BuildContext *build_data = (struct BuildContext *)ctx; + // in[0], out[0] solution values with shape [1, 1, Q] + // in[1], out[1] solution gradients with shape [dim, 1, Q] + // in[2] is quadrature data with shape [num_components, Q] + const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + + switch (build_data->dim) { + case 1: { + const CeedScalar *u = in[0], *ug = in[1]; + CeedScalar *v = out[0], *vg = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + v[i] = q_data[0][i] * u[i]; + + // Diffusion + vg[i] = q_data[1][i] * ug[i]; + } + } break; + case 2: { + const CeedScalar *u = in[0]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + 
CeedScalar *v = out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + v[i] = q_data[0][i] * u[i]; + + // Diffusion + // Read q_data (dXdxdXdx_T symmetric matrix) + // Stored in Voigt convention + // 1 3 + // 3 2 + const CeedScalar dXdxdXdx_T[2][2] = { + {q_data[1][i], q_data[3][i]}, + {q_data[3][i], q_data[2][i]} + }; + + // j = direction of vg + for (int j = 0; j < 2; j++) { + vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]); + } + } + } break; + case 3: { + const CeedScalar *u = in[0]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + CeedScalar *v = out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Mass + v[i] = q_data[0][i] * u[i]; + + // Diffusion + // Read q_data (dXdxdXdx_T symmetric matrix) + // Stored in Voigt convention + // 1 6 5 + // 6 2 4 + // 5 4 3 + const CeedScalar dXdxdXdx_T[3][3] = { + {q_data[1][i], q_data[6][i], q_data[5][i]}, + {q_data[6][i], q_data[2][i], q_data[4][i]}, + {q_data[5][i], q_data[4][i], q_data[3][i]} + }; + + // j = direction of vg + for (int j = 0; j < 3; j++) { + vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]); + } + } + } break; + } + return CEED_ERROR_SUCCESS; +} diff --git a/examples/python/qfunctions/qfunctions.c b/examples/python/qfunctions/qfunctions.c new file mode 100644 index 0000000000..ee41a501a7 --- /dev/null +++ b/examples/python/qfunctions/qfunctions.c @@ -0,0 +1,22 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +// ----------------------------------------------------------------------------- +// Redefine QFunction Macro +// ----------------------------------------------------------------------------- +#undef CEED_QFUNCTION +#define CEED_QFUNCTION(name) extern int name + +// ----------------------------------------------------------------------------- +// QFunction Sources +// ----------------------------------------------------------------------------- +#include "ex1-volume.h" +#include "ex2-surface.h" +#include "ex3-volume.h" + +// ----------------------------------------------------------------------------- diff --git a/examples/python/setup_qfunctions.py b/examples/python/setup_qfunctions.py new file mode 100644 index 0000000000..8c337621e7 --- /dev/null +++ b/examples/python/setup_qfunctions.py @@ -0,0 +1,32 @@ +from setuptools import setup, Extension +from sys import platform +import os + +# Get CEED directory +ceed_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Include directories +include_dirs = [os.path.join(ceed_dir, "include")] + +# Library directories +library_dirs = [os.path.join(ceed_dir, "lib")] + +# Source files +sources = ["qfunctions/qfunctions.c"] + +# Compiler arguments +extra_compile_args = [] +if platform == "linux" or platform == "linux2" or platform == "darwin": + extra_compile_args = ["-O3", "-march=native", "-std=c11"] + +# Define the extension module +qfunctions = Extension("libceed_c_qfunctions", + sources=sources, + include_dirs=include_dirs, + library_dirs=library_dirs, + libraries=["ceed"], + extra_compile_args=extra_compile_args) + +# Setup +setup(name="libceed_c_qfunctions", + ext_modules=[qfunctions]) diff --git a/examples/python/tutorial-0-ceed.ipynb b/examples/python/tutorial-0-ceed.ipynb index b1d712a552..801081154f 100644 --- a/examples/python/tutorial-0-ceed.ipynb +++ 
b/examples/python/tutorial-0-ceed.ipynb @@ -92,8 +92,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Similarly, if libCEED is built with GPU support, you can specify a GPU backend, e.g., `/gpu/occa` or `/gpu/cuda/gen`." + "Similarly, if libCEED is built with GPU support, you can specify a GPU backend, e.g., `/gpu/hip` or `/gpu/cuda/gen`." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/python/tutorial-2-elemrestriction.ipynb b/examples/python/tutorial-2-elemrestriction.ipynb index c9a9483288..6c8f8593a7 100644 --- a/examples/python/tutorial-2-elemrestriction.ipynb +++ b/examples/python/tutorial-2-elemrestriction.ipynb @@ -61,20 +61,20 @@ "\n", "ceed = libceed.Ceed()\n", "\n", - "ne = 3\n", + "num_elem = 3\n", "\n", - "x = ceed.Vector(ne+1)\n", - "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n", + "x = ceed.Vector(num_elem+1)\n", + "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n", "x.set_array(a, cmode=libceed.USE_POINTER)\n", "\n", - "ind = np.zeros(2*ne, dtype=\"int32\")\n", - "for i in range(ne):\n", - " ind[2*i+0] = i\n", - " ind[2*i+1] = i+1\n", + "indices = np.zeros(2*num_elem, dtype=\"int32\")\n", + "for i in range(num_elem):\n", + " indices[2*i+0] = i\n", + " indices[2*i+1] = i+1\n", " \n", - "r = ceed.ElemRestriction(ne, 2, 1, 1, ne+1, ind, cmode=libceed.USE_POINTER)\n", + "r = ceed.ElemRestriction(num_elem, 2, 1, 1, num_elem+1, indices, cmode=libceed.USE_POINTER)\n", "\n", - "y = ceed.Vector(2*ne)\n", + "y = ceed.Vector(2*num_elem)\n", "y.set_value(0)\n", "\n", "r.apply(x, y)\n", @@ -100,17 +100,17 @@ "# \n", "# x -- o -- o -- x -- o -- o -- x -- o -- o -- x\n", "\n", - "ne = 3\n", + "num_elem = 3\n", "\n", - "ind = np.zeros(4*ne, dtype=\"int32\")\n", + "indices = np.zeros(4*num_elem, dtype=\"int32\")\n", "\n", - "for i in range(ne):\n", - " ind[4*i+0] = i*3+0\n", - " ind[4*i+1] = i*3+1\n", - " ind[4*i+2] = i*3+2\n", - " 
ind[4*i+3] = i*3+3\n", + "for i in range(num_elem):\n", + " indices[4*i+0] = i*3+0\n", + " indices[4*i+1] = i*3+1\n", + " indices[4*i+2] = i*3+2\n", + " indices[4*i+3] = i*3+3\n", "\n", - "r = ceed.ElemRestriction(ne, 4, 1, 1, 3*ne+1, ind, cmode=libceed.USE_POINTER)\n", + "r = ceed.ElemRestriction(num_elem, 4, 1, 1, 3*num_elem+1, indices, cmode=libceed.USE_POINTER)\n", "\n", "mult = r.get_multiplicity()\n", "\n", @@ -141,17 +141,17 @@ "# x -- x | x -- x | x -- x\n", "# 10 -- 11 | 12 -- 13 | 14 -- 15\n", "\n", - "ne = 3\n", + "num_elem = 3\n", "\n", - "x = ceed.Vector(2*ne)\n", - "a = np.arange(10, 10 + 2*ne, dtype=\"float64\")\n", + "x = ceed.Vector(2*num_elem)\n", + "a = np.arange(10, 10 + 2*num_elem, dtype=\"float64\")\n", "x.set_array(a, cmode=libceed.USE_POINTER)\n", "\n", "strides = np.array([1, 2, 2], dtype=\"int32\")\n", "\n", - "r = ceed.StridedElemRestriction(ne, 2, 1, 2*ne, strides)\n", + "r = ceed.StridedElemRestriction(num_elem, 2, 1, 2*num_elem, strides)\n", "\n", - "y = ceed.Vector(2*ne)\n", + "y = ceed.Vector(2*num_elem)\n", "y.set_value(0)\n", "\n", "r.apply(x, y)\n", @@ -177,11 +177,11 @@ "# \n", "# x -- x -- x -- x\n", "\n", - "ne = 3\n", + "num_elem = 3\n", "\n", "strides = np.array([1, 2, 2], dtype=\"int32\")\n", "\n", - "r = ceed.BlockedStridedElemRestriction(ne, 2, 2, 1, ne+1, strides)\n", + "r = ceed.BlockedStridedElemRestriction(num_elem, 2, 2, 1, 2*(num_elem+1), strides)\n", "\n", "print(r)" ] @@ -233,22 +233,22 @@ "# | 10-11-12-13-14 11-12-13-14-15 | 15-16-17-17-17 16-17-18-18-18 |\n", "# | e0 e1 e2 e3 e4 e0 e1 e2 e3 e4 | e0 e1 e2 e3 e4 e0 e1 e2 e3 e4 |\n", "\n", - "ne = 8\n", - "blksize = 5\n", + "num_elem = 8\n", + "block_size = 5\n", "\n", - "x = ceed.Vector(ne+1)\n", - "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n", + "x = ceed.Vector(num_elem+1)\n", + "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n", "x.set_array(a, cmode=libceed.USE_POINTER)\n", "\n", - "ind = np.zeros(2*ne, dtype=\"int32\")\n", - "for i in 
range(ne):\n", - " ind[2*i+0] = i\n", - " ind[2*i+1] = i+1\n", + "indices = np.zeros(2*num_elem, dtype=\"int32\")\n", + "for i in range(num_elem):\n", + " indices[2*i+0] = i\n", + " indices[2*i+1] = i+1\n", "\n", - "r = ceed.BlockedElemRestriction(ne, 2, blksize, 1, 1, ne+1, ind,\n", + "r = ceed.BlockedElemRestriction(num_elem, 2, block_size, 1, 1, num_elem+1, indices,\n", " cmode=libceed.USE_POINTER)\n", "\n", - "y = ceed.Vector(2*blksize*2)\n", + "y = ceed.Vector(2*block_size*2)\n", "y.set_value(0)\n", "\n", "r.apply(x, y)\n", @@ -303,22 +303,22 @@ "# | 15-16-17-17-17 16-17-18-18-18 |\n", "# | e0 e1 e2 e3 e4 e0 e1 e2 e3 e4 |\n", "\n", - "ne = 8\n", - "blksize = 5\n", + "num_elem = 8\n", + "block_size = 5\n", "\n", - "x = ceed.Vector(ne+1)\n", - "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n", + "x = ceed.Vector(num_elem+1)\n", + "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n", "x.set_array(a, cmode=libceed.USE_POINTER)\n", "\n", - "ind = np.zeros(2*ne, dtype=\"int32\")\n", - "for i in range(ne):\n", - " ind[2*i+0] = i\n", - " ind[2*i+1] = i+1\n", + "indices = np.zeros(2*num_elem, dtype=\"int32\")\n", + "for i in range(num_elem):\n", + " indices[2*i+0] = i\n", + " indices[2*i+1] = i+1\n", "\n", - "r = ceed.BlockedElemRestriction(ne, 2, blksize, 1, 1, ne+1, ind,\n", + "r = ceed.BlockedElemRestriction(num_elem, 2, block_size, 1, 1, num_elem+1, indices,\n", " cmode=libceed.USE_POINTER)\n", "\n", - "y = ceed.Vector(blksize*2)\n", + "y = ceed.Vector(block_size*2)\n", "y.set_value(0)\n", "\n", "r.apply_block(1, x, y)\n", @@ -343,7 +343,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -357,7 +357,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.13.2" } }, "nbformat": 4, diff --git a/examples/python/tutorial-3-basis.ipynb b/examples/python/tutorial-3-basis.ipynb index 
a2141e4e9d..ef18be2789 100644 --- a/examples/python/tutorial-3-basis.ipynb +++ b/examples/python/tutorial-3-basis.ipynb @@ -63,11 +63,11 @@ " center += 0.1\n", " return result\n", "\n", - "def feval(x1, x2):\n", - " return x1*x1 + x2*x2 + x1*x2 + 1\n", + "def feval(x_1, x_2):\n", + " return x_1*x_1 + x_2*x_2 + x_1*x_2 + 1\n", "\n", - "def dfeval(x1, x2):\n", - " return 2*x1 + x2" + "def dfeval(x_1, x_2):\n", + " return 2*x_1 + x_2" ] }, { @@ -112,24 +112,24 @@ "outputs": [], "source": [ "P = b.get_num_nodes()\n", - "nviz = 50\n", - "bviz = ceed.BasisTensorH1Lagrange(1, 1, P, nviz, libceed.GAUSS_LOBATTO)\n", + "Q_viz = 50\n", + "basis_viz = ceed.BasisTensorH1Lagrange(1, 1, P, Q_viz, libceed.GAUSS_LOBATTO)\n", "\n", "# Construct P \"elements\" with one node activated\n", "I = ceed.Vector(P * P)\n", - "with I.array(P, P) as x:\n", + "with I.array_write(P, P) as x:\n", " x[...] = np.eye(P)\n", "\n", - "Bvander = ceed.Vector(P * nviz)\n", - "bviz.apply(4, libceed.EVAL_INTERP, I, Bvander)\n", + "basis_fns = ceed.Vector(P * Q_viz)\n", + "basis_viz.apply(4, libceed.EVAL_INTERP, I, basis_fns)\n", "\n", - "qviz, _weight = ceed.lobatto_quadrature(nviz)\n", - "with Bvander.array_read(nviz, P) as B:\n", - " plt.plot(qviz, B)\n", + "qpts_viz, _ = ceed.lobatto_quadrature(Q_viz)\n", + "with basis_fns.array_read(Q_viz, P) as B_array:\n", + " plt.plot(qpts_viz, B_array)\n", "\n", "# Mark tho Lobatto nodes\n", - "qb, _weight = ceed.lobatto_quadrature(P)\n", - "plt.plot(qb, 0*qb, 'ok');" + "nodes, _ = ceed.lobatto_quadrature(P)\n", + "plt.plot(nodes, 0*nodes, 'ok');" ] }, { @@ -148,11 +148,11 @@ "b = ceed.BasisTensorH1Lagrange(1, 1, 4, 4, libceed.GAUSS)\n", "print(b)\n", "\n", - "with Bvander.array_read(nviz, P) as B:\n", - " plt.plot(qviz, B)\n", + "with basis_fns.array_read(Q_viz, P) as B_array:\n", + " plt.plot(qpts_viz, B_array)\n", "# Mark tho Gauss quadrature points\n", - "qb, _weight = ceed.gauss_quadrature(P)\n", - "plt.plot(qb, 0*qb, 'ok');" + "qpts, _ = 
ceed.gauss_quadrature(P)\n", + "plt.plot(qpts, 0*qpts, 'ok');" ] }, { @@ -193,54 +193,52 @@ "source": [ "for dim in range(1, 4):\n", " Q = 4\n", - " Qdim = Q**dim\n", - " Xdim = 2**dim\n", - " x = np.empty(Xdim*dim, dtype=\"float64\")\n", - " uq = np.empty(Qdim, dtype=\"float64\")\n", + " Q_dim = Q**dim\n", + " X_dim = 2**dim\n", + " x = np.empty(X_dim*dim, dtype=\"float64\")\n", + " u_array = np.empty(Q_dim, dtype=\"float64\")\n", "\n", " for d in range(dim):\n", - " for i in range(Xdim):\n", - " x[d*Xdim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n", + " for i in range(X_dim):\n", + " x[d*X_dim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n", "\n", - " X = ceed.Vector(Xdim*dim)\n", + " X = ceed.Vector(X_dim*dim)\n", " X.set_array(x, cmode=libceed.USE_POINTER)\n", - " Xq = ceed.Vector(Qdim*dim)\n", - " Xq.set_value(0)\n", - " U = ceed.Vector(Qdim)\n", + " X_q = ceed.Vector(Q_dim*dim)\n", + " X_q.set_value(0)\n", + " U = ceed.Vector(Q_dim)\n", " U.set_value(0)\n", - " Uq = ceed.Vector(Qdim)\n", + " U_q = ceed.Vector(Q_dim)\n", "\n", - " bxl = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS_LOBATTO)\n", - " bul = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS_LOBATTO)\n", + " basis_x_lobatto = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS_LOBATTO)\n", + " basis_u_lobatto = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS_LOBATTO)\n", "\n", - " bxl.apply(1, libceed.EVAL_INTERP, X, Xq)\n", + " basis_x_lobatto.apply(1, libceed.EVAL_INTERP, X, X_q)\n", "\n", - " with Xq.array_read() as xq:\n", - " for i in range(Qdim):\n", - " xx = np.empty(dim, dtype=\"float64\")\n", + " with X_q.array_read() as x_array:\n", + " for i in range(Q_dim):\n", + " x = np.empty(dim, dtype=\"float64\")\n", " for d in range(dim):\n", - " xx[d] = xq[d*Qdim + i]\n", - " uq[i] = eval(dim, xx)\n", + " x[d] = x_array[d*Q_dim + i]\n", + " u_array[i] = eval(dim, x)\n", "\n", - " Uq.set_array(uq, cmode=libceed.USE_POINTER)\n", + " 
U_q.set_array(u_array, cmode=libceed.USE_POINTER)\n", "\n", " # This operation is the identity because the quadrature is collocated\n", - " bul.T.apply(1, libceed.EVAL_INTERP, Uq, U)\n", + " basis_u_lobatto.T.apply(1, libceed.EVAL_INTERP, U_q, U)\n", "\n", - " bxg = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS)\n", - " bug = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS)\n", + " basis_x_gauss = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS)\n", + " basis_u_gauss = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS)\n", "\n", - " bxg.apply(1, libceed.EVAL_INTERP, X, Xq)\n", - " bug.apply(1, libceed.EVAL_INTERP, U, Uq)\n", + " basis_x_gauss.apply(1, libceed.EVAL_INTERP, X, X_q)\n", + " basis_u_gauss.apply(1, libceed.EVAL_INTERP, U, U_q)\n", "\n", - " with Xq.array_read() as xq, Uq.array_read() as u:\n", - " #print('xq =', xq)\n", - " #print('u =', u)\n", + " with X_q.array_read() as x_array, U_q.array_read() as u_array:\n", " if dim == 2:\n", " # Default ordering is contiguous in x direction, but\n", " # pyplot expects meshgrid convention, which is transposed.\n", - " x, y = xq.reshape(2, Q, Q).transpose(0, 2, 1)\n", - " plt.scatter(x, y, c=np.array(u).reshape(Q, Q))\n", + " x, y = x_array.reshape(2, Q, Q).transpose(0, 2, 1)\n", + " plt.scatter(x, y, c=np.array(u_array).reshape(Q, Q))\n", " plt.xlim(-1, 1)\n", " plt.ylim(-1, 1)\n", " plt.colorbar(label='u')" @@ -261,62 +259,62 @@ "source": [ "for dim in range (1, 4):\n", " P, Q = 8, 10\n", - " Pdim = P**dim\n", - " Qdim = Q**dim\n", - " Xdim = 2**dim\n", - " sum1 = sum2 = 0\n", - " x = np.empty(Xdim*dim, dtype=\"float64\")\n", - " u = np.empty(Pdim, dtype=\"float64\")\n", + " P_dim = P**dim\n", + " Q_dim = Q**dim\n", + " X_dim = 2**dim\n", + " sum_1 = sum_2 = 0\n", + " x_array = np.empty(X_dim*dim, dtype=\"float64\")\n", + " u_array = np.empty(P_dim, dtype=\"float64\")\n", "\n", " for d in range(dim):\n", - " for i in range(Xdim):\n", - " x[d*Xdim + i] = 1 if (i % (2**(dim-d))) // 
(2**(dim-d-1)) else -1\n", - "\n", - " X = ceed.Vector(Xdim*dim)\n", - " X.set_array(x, cmode=libceed.USE_POINTER)\n", - " Xq = ceed.Vector(Pdim*dim)\n", - " Xq.set_value(0)\n", - " U = ceed.Vector(Pdim)\n", - " Uq = ceed.Vector(Qdim*dim)\n", - " Uq.set_value(0)\n", - " Ones = ceed.Vector(Qdim*dim)\n", + " for i in range(X_dim):\n", + " x_array[d*X_dim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n", + "\n", + " X = ceed.Vector(X_dim*dim)\n", + " X.set_array(x_array, cmode=libceed.USE_POINTER)\n", + " X_q = ceed.Vector(P_dim*dim)\n", + " X_q.set_value(0)\n", + " U = ceed.Vector(P_dim)\n", + " U_q = ceed.Vector(Q_dim*dim)\n", + " U_q.set_value(0)\n", + " Ones = ceed.Vector(Q_dim*dim)\n", " Ones.set_value(1)\n", - " Gtposeones = ceed.Vector(Pdim)\n", - " Gtposeones.set_value(0)\n", + " G_transpose_ones = ceed.Vector(P_dim)\n", + " G_transpose_ones.set_value(0)\n", "\n", " # Get function values at quadrature points\n", - " bxl = ceed.BasisTensorH1Lagrange(dim, dim, 2, P, libceed.GAUSS_LOBATTO)\n", - " bxl.apply(1, libceed.EVAL_INTERP, X, Xq)\n", + " basis_x_lobatto = ceed.BasisTensorH1Lagrange(dim, dim, 2, P, libceed.GAUSS_LOBATTO)\n", + " basis_x_lobatto.apply(1, libceed.EVAL_INTERP, X, X_q)\n", "\n", - " with Xq.array_read() as xq:\n", - " for i in range(Pdim):\n", - " xx = np.empty(dim, dtype=\"float64\")\n", + " with X_q.array_read() as x_array:\n", + " for i in range(P_dim):\n", + " x = np.empty(dim, dtype=\"float64\")\n", " for d in range(dim):\n", - " xx[d] = xq[d*Pdim + i]\n", - " u[i] = eval(dim, xx)\n", + " x[d] = x_array[d*P_dim + i]\n", + " u_array[i] = eval(dim, x)\n", "\n", - " U.set_array(u, cmode=libceed.USE_POINTER)\n", + " U.set_array(u_array, cmode=libceed.USE_POINTER)\n", "\n", " # Calculate G u at quadrature points, G' * 1 at dofs\n", - " bug = ceed.BasisTensorH1Lagrange(dim, 1, P, Q, libceed.GAUSS)\n", - " bug.apply(1, libceed.EVAL_GRAD, U, Uq)\n", - " bug.T.apply(1, libceed.EVAL_GRAD, Ones, Gtposeones)\n", + " basis_u_gauss = 
ceed.BasisTensorH1Lagrange(dim, 1, P, Q, libceed.GAUSS)\n", + " basis_u_gauss.apply(1, libceed.EVAL_GRAD, U, U_q)\n", + " basis_u_gauss.T.apply(1, libceed.EVAL_GRAD, Ones, G_transpose_ones)\n", "\n", " # Check if 1' * G * u = u' * (G' * 1)\n", - " with Gtposeones.array_read() as gtposeones, Uq.array_read() as uq:\n", - " for i in range(Pdim):\n", - " sum1 += gtposeones[i]*u[i]\n", - " for i in range(dim*Qdim):\n", - " sum2 += uq[i]\n", + " with G_transpose_ones.array_read() as g_array, U_q.array_read() as uq_array:\n", + " for i in range(P_dim):\n", + " sum_1 += g_array[i]*u_array[i]\n", + " for i in range(dim*Q_dim):\n", + " sum_2 += uq_array[i]\n", "\n", " # Check that (1' * G * u - u' * (G' * 1)) is numerically zero\n", - " print('1T * G * u - uT * (GT * 1) =', np.abs(sum1 - sum2))" + " print('1T * G * u - uT * (GT * 1) =', np.abs(sum_1 - sum_2))" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -330,7 +328,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.13.2" } }, "nbformat": 4, diff --git a/examples/python/tutorial-4-qfunction.ipynb b/examples/python/tutorial-4-qfunction.ipynb index d6495e241e..9aba23d2fc 100644 --- a/examples/python/tutorial-4-qfunction.ipynb +++ b/examples/python/tutorial-4-qfunction.ipynb @@ -189,7 +189,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -203,7 +203,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.13.2" } }, "nbformat": 4, diff --git a/examples/python/tutorial-5-operator.ipynb b/examples/python/tutorial-5-operator.ipynb index 123a59836c..bb756a42a7 100644 --- a/examples/python/tutorial-5-operator.ipynb +++ b/examples/python/tutorial-5-operator.ipynb @@ -56,41 +56,41 @@ 
"\n", "ceed = libceed.Ceed()\n", "\n", - "nelem = 15\n", + "num_elem = 15\n", "p = 5\n", "q = 8\n", - "nx = nelem + 1\n", - "nu = nelem*(p-1) + 1\n", + "num_x = num_elem + 1\n", + "num_u = num_elem*(p-1) + 1\n", "\n", "# Vectors\n", - "x = ceed.Vector(nx)\n", - "x_array = np.zeros(nx)\n", - "for i in range(nx):\n", - " x_array[i] = i / (nx - 1.0)\n", + "x = ceed.Vector(num_x)\n", + "x_array = np.zeros(num_x)\n", + "for i in range(num_x):\n", + " x_array[i] = i / (num_x - 1.0)\n", "x.set_array(x_array, cmode=libceed.USE_POINTER)\n", "\n", - "qdata = ceed.Vector(nelem*q)\n", - "u = ceed.Vector(nu)\n", - "v = ceed.Vector(nu)\n", + "q_data = ceed.Vector(num_elem*q)\n", + "u = ceed.Vector(num_u)\n", + "v = ceed.Vector(num_u)\n", "\n", "# Restrictions\n", - "indx = np.zeros(nx*2, dtype=\"int32\")\n", - "for i in range(nx):\n", - " indx[2*i+0] = i\n", - " indx[2*i+1] = i+1\n", - "rx = ceed.ElemRestriction(nelem, 2, 1, 1, nx, indx, cmode=libceed.USE_POINTER)\n", - "\n", - "indu = np.zeros(nelem*p, dtype=\"int32\")\n", - "for i in range(nelem):\n", + "indices_x = np.zeros(num_x*2, dtype=\"int32\")\n", + "for i in range(num_x):\n", + " indices_x[2*i+0] = i\n", + " indices_x[2*i+1] = i+1\n", + "restriction_x = ceed.ElemRestriction(num_elem, 2, 1, 1, num_x, indices_x, cmode=libceed.USE_POINTER)\n", + "\n", + "indices_u = np.zeros(num_elem*p, dtype=\"int32\")\n", + "for i in range(num_elem):\n", " for j in range(p):\n", - " indu[p*i+j] = i*(p-1) + j\n", - "ru = ceed.ElemRestriction(nelem, p, 1, 1, nu, indu, cmode=libceed.USE_POINTER)\n", + " indices_u[p*i+j] = i*(p-1) + j\n", + "restriction_u = ceed.ElemRestriction(num_elem, p, 1, 1, num_u, indices_u, cmode=libceed.USE_POINTER)\n", "strides = np.array([1, q, q], dtype=\"int32\")\n", - "rui = ceed.StridedElemRestriction(nelem, q, 1, q*nelem, strides)\n", + "restriction_q_data = ceed.StridedElemRestriction(num_elem, q, 1, q*num_elem, strides)\n", "\n", "# Bases\n", - "bx = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n", 
- "bu = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n", + "basis_x = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n", + "basis_u = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n", "\n", "# QFunctions\n", "qf_setup = ceed.QFunctionByName(\"Mass1DBuild\")\n", @@ -98,24 +98,24 @@ "\n", "# Setup operator\n", "op_setup = ceed.Operator(qf_setup)\n", - "op_setup.set_field(\"dx\", rx, bx, libceed.VECTOR_ACTIVE)\n", - "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, bx,\n", + "op_setup.set_field(\"dx\", restriction_x, basis_x, libceed.VECTOR_ACTIVE)\n", + "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, basis_x,\n", " libceed.VECTOR_NONE)\n", - "op_setup.set_field(\"qdata\", rui, libceed.BASIS_NONE,\n", + "op_setup.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE,\n", " libceed.VECTOR_ACTIVE)\n", "op_setup.check()\n", "print('Setup operator: ', op_setup)\n", "\n", "# Mass operator\n", "op_mass = ceed.Operator(qf_mass)\n", - "op_mass.set_field(\"u\", ru, bu, libceed.VECTOR_ACTIVE)\n", - "op_mass.set_field(\"qdata\", rui, libceed.BASIS_NONE, qdata)\n", - "op_mass.set_field(\"v\", ru, bu, libceed.VECTOR_ACTIVE)\n", + "op_mass.set_field(\"u\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n", + "op_mass.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE, q_data)\n", + "op_mass.set_field(\"v\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n", "op_mass.check()\n", "print('Mass operator: ', op_mass)\n", "\n", "# Setup\n", - "op_setup.apply(x, qdata)\n", + "op_setup.apply(x, q_data)\n", "\n", "# Apply mass matrix\n", "u.set_value(1)\n", @@ -125,11 +125,103 @@ "with v.array_read() as v_array:\n", " print('The length of the domain is l = %4.2f'%np.sum(v_array))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* In the next example, we create and apply a CeedOperator for the Poisson operator in 1D. 
By applying this operator to a vector with a linear function, we compute the 'surface area' of this 1D domain, similar to Ex2-Surface in the [tutorial-6-shell tutorial](./tutorial-6-shell.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import libceed\n", + "import numpy as np\n", + "\n", + "ceed = libceed.Ceed()\n", + "\n", + "num_elem = 15\n", + "p = 5\n", + "q = 8\n", + "num_x = num_elem + 1\n", + "num_u = num_elem*(p-1) + 1\n", + "\n", + "# Vectors\n", + "x = ceed.Vector(num_x)\n", + "x_array = np.zeros(num_x)\n", + "for i in range(num_x):\n", + " x_array[i] = i / (num_x - 1.0)\n", + "x.set_array(x_array, cmode=libceed.USE_POINTER)\n", + "\n", + "q_data = ceed.Vector(num_elem*q)\n", + "u = ceed.Vector(num_u)\n", + "v = ceed.Vector(num_u)\n", + "\n", + "# Restrictions\n", + "indices_x = np.zeros(num_x*2, dtype=\"int32\")\n", + "for i in range(num_x):\n", + " indices_x[2*i+0] = i\n", + " indices_x[2*i+1] = i+1\n", + "restriction_x = ceed.ElemRestriction(num_elem, 2, 1, 1, num_x, indices_x, cmode=libceed.USE_POINTER)\n", + "\n", + "indices_u = np.zeros(num_elem*p, dtype=\"int32\")\n", + "for i in range(num_elem):\n", + " for j in range(p):\n", + " indices_u[p*i+j] = i*(p-1) + j\n", + "restriction_u = ceed.ElemRestriction(num_elem, p, 1, 1, num_u, indices_u, cmode=libceed.USE_POINTER)\n", + "strides = np.array([1, q, q], dtype=\"int32\")\n", + "restriction_q_data = ceed.StridedElemRestriction(num_elem, q, 1, q*num_elem, strides)\n", + "\n", + "# Bases\n", + "basis_x = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n", + "basis_u = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n", + "\n", + "# QFunctions\n", + "qf_setup = ceed.QFunctionByName(\"Poisson1DBuild\")\n", + "qf_mass = ceed.QFunctionByName(\"Poisson1DApply\")\n", + "\n", + "# Setup operator\n", + "op_setup = ceed.Operator(qf_setup)\n", + "op_setup.set_field(\"dx\", restriction_x, basis_x, libceed.VECTOR_ACTIVE)\n", + 
"op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, basis_x,\n", + " libceed.VECTOR_NONE)\n", + "op_setup.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE,\n", + " libceed.VECTOR_ACTIVE)\n", + "op_setup.check()\n", + "print('Setup operator: ', op_setup)\n", + "\n", + "# Poisson operator\n", + "op_poisson = ceed.Operator(qf_mass)\n", + "op_poisson.set_field(\"du\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n", + "op_poisson.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE, q_data)\n", + "op_poisson.set_field(\"dv\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n", + "op_poisson.check()\n", + "print('Poisson operator: ', op_poisson)\n", + "\n", + "# Setup\n", + "op_setup.apply(x, q_data)\n", + "\n", + "# Apply Poisson operator\n", + "with u.array_write() as u_array:\n", + " [points, _] = ceed.lobatto_quadrature(p)\n", + " for elem in range(num_elem):\n", + " for point in range(p):\n", + " u_array[elem * (p - 1) + point] = (1.0 + 2.0 * elem + points[point])/(2.0 * num_elem)\n", + "op_poisson.apply(u, v)\n", + "\n", + "# Check\n", + "with v.array_read() as v_array:\n", + " print('The surface area of the domain is dl = %4.2f'%np.sum(abs(v_array)))" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -143,7 +235,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.13.2" } }, "nbformat": 4, diff --git a/examples/rust-qfunctions/.gitignore b/examples/rust-qfunctions/.gitignore new file mode 100644 index 0000000000..f2ceaf60f1 --- /dev/null +++ b/examples/rust-qfunctions/.gitignore @@ -0,0 +1,2 @@ +ex1-volume +temp/* diff --git a/examples/rust-qfunctions/Makefile b/examples/rust-qfunctions/Makefile new file mode 100644 index 0000000000..2fba76706a --- /dev/null +++ b/examples/rust-qfunctions/Makefile @@ -0,0 +1,35 @@ +# Copyright (c) 2017-2026, Lawrence Livermore 
National Security, LLC and other CEED contributors +# All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of CEED: http://github.com/ceed + +OPT ?= -O -g + +# Ceed directory +CEED_DIR ?= ../.. +CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c11 $(OPT) +CEED_LIBS ?= -Wl,-rpath,$(abspath $(CEED_DIR)/lib) -L$(CEED_DIR)/lib -lceed -L$(CEED_DIR)/examples/ceed -lm + +EXAMPLES.c = $(wildcard ex*.c) +EXAMPLES = $(EXAMPLES.c:%.c=%) + +.SUFFIXES: +.SUFFIXES: .c +.PHONY: all clean + +all: $(EXAMPLES) + +# Remove built-in rules +%: %.c + +# Special build rule for example 1 (rust) +ex1-volume: ex1-volume.c + cargo +nightly build --release --manifest-path ex1-volume-rs/Cargo.toml --config ex1-volume-rs/.cargo/config.toml + $(LINK.c) $(CEED_FLAGS) $(CEED_LDFLAGS) $(abspath $<) -o $@ $(CEED_LIBS) -L$(CEED_DIR)/examples/rust-qfunctions/ex1-volume-rs/target/release -lex1_volume_rs + +clean: + rm -f *~ $(EXAMPLES) + rm -rf temp/ + rm -rf *.dSYM *.TVD.*breakpoints diff --git a/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml b/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml new file mode 100644 index 0000000000..ca727ba27d --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml @@ -0,0 +1,6 @@ +[target.nvptx64-nvidia-cuda] +rustflags = [ + "-C", "linker-plugin-lto", +] +[unstable] +build-std = ["panic_abort","core", "alloc"] diff --git a/examples/rust-qfunctions/ex1-volume-rs/.gitignore b/examples/rust-qfunctions/ex1-volume-rs/.gitignore new file mode 100644 index 0000000000..20a838f835 --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume-rs/.gitignore @@ -0,0 +1,3 @@ +target +registry +Cargo.lock diff --git a/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml b/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml new file mode 100644 index 0000000000..afc2f3b200 --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml @@ -0,0 +1,17 @@ +[package] 
+name = "ex1-volume-rs" +version = "0.1.0" +edition = "2021" + +[profile.dev] +panic = "abort" + +[profile.release] +panic = "abort" + +# Compiles the crate as a lib (for GPU) and staticlib (for CPU) +[lib] +crate-type = ["staticlib"] + +[dependencies] +ndarray = {version = "0.16.1", default-features = false} diff --git a/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml b/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml new file mode 100644 index 0000000000..5d56faf9ae --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "nightly" diff --git a/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs b/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs new file mode 100644 index 0000000000..8f2a36dfc9 --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs @@ -0,0 +1,124 @@ +#![no_std] +#![allow(internal_features)] +#![feature(asm_experimental_arch, abi_ptx, core_intrinsics)] +use core::ffi::c_void; +use core::intrinsics::abort; +use core::panic::PanicInfo; + +use ndarray::ArrayView; + +// This is a dummy allocator that always returns null. Heap allocations do not work on GPUs +use core::alloc::{GlobalAlloc, Layout}; +pub struct Allocator; +unsafe impl GlobalAlloc for Allocator { + unsafe fn alloc(&self, _layout: Layout) -> *mut u8 { + 0 as *mut u8 + } + unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) { + abort(); // since we never allocate + } +} +#[global_allocator] +static GLOBAL_ALLOCATOR: Allocator = Allocator; + +// This is a copy of the same data structure defined in the .h file. It can be autogenerated using bindgen/cbindgen +#[doc = " A structure used to pass additional data to f_build_mass"] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct BuildContext { + pub dim: i32, + pub space_dim: i32, +} + +// On no_std targets, it's required to implement your own panic function. +#[panic_handler] +fn panic(_info: &PanicInfo) -> !
{ + abort() +} + +/* The no_mangle is required because rust "mangles" names (changes them to prevent namespace conflicts) +Also note that this function ends in _rs, even though the C call `CEED_QFUNCTION_RUST(build_mass)` doesn't */ +#[no_mangle] +pub unsafe extern "C" fn build_mass_rs( + ctx: *mut c_void, + q: i32, + in_: *const *const f64, + out: *mut *mut f64, +) -> i8 { + let ctx: *mut BuildContext = unsafe { core::mem::transmute(ctx) }; + let ctx: &mut BuildContext = &mut *ctx; + + let in_slice = core::slice::from_raw_parts(in_, 2); + + // in_slice[0] is Jacobians with shape [dim, dim, Q] + // in_slice[1] is quadrature weights with shape [1, Q] + let j_ptr = in_slice[0]; + let w_ptr = in_slice[1]; + + let j = ArrayView::from_shape_ptr((ctx.dim as usize, ctx.dim as usize, q as usize), j_ptr); + + let w = core::slice::from_raw_parts(w_ptr, q as usize); + + let out_slice = core::slice::from_raw_parts_mut(out, 1); + let q_data = core::slice::from_raw_parts_mut(out_slice[0], q as usize); + + match ctx.dim * 10 + ctx.space_dim { + 11 => { + // Quadrature Point Loop + for i in 0..q as usize { + q_data[i] = j[[0, 0, i]] * w[i]; + } + } + 22 => { + // Quadrature Point Loop + for i in 0..q as usize { + q_data[i] = (j[[0, 0, i]] * j[[1, 1, i]] - j[[0, 1, i]] * j[[1, 0, i]]) * w[i]; + } + } + 33 => { + // Quadrature Point Loop + for i in 0..q as usize { + q_data[i] = (j[[0, 0, i]] + * (j[[1, 1, i]] * j[[2, 2, i]] - j[[1, 2, i]] * j[[2, 1, i]]) + - j[[0, 1, i]] * (j[[1, 0, i]] * j[[2, 2, i]] - j[[1, 2, i]] * j[[2, 0, i]]) + + j[[0, 2, i]] * (j[[1, 0, i]] * j[[2, 1, i]] - j[[1, 1, i]] * j[[2, 0, i]])) + * w[i]; + } + } + _ => { + abort(); + } + } + + 0 +} + +/* The no_mangle is required because rust "mangles" names (changes them to prevent namespace conflicts) +Also note that this function ends in _rs, even though the C call `CEED_QFUNCTION_RUST(apply_mass)` doesn't +For FFI reasons, it is also required to include all parameters in this exact form, even if you don't use all 
of them*/ +#[no_mangle] +pub unsafe extern "C" fn apply_mass_rs( + _ctx: *mut c_void, + q: i32, + in_: *const *const f64, + out: *mut *mut f64, +) -> i8 { + let in_slice = core::slice::from_raw_parts(in_, 2); + + let u_ptr = in_slice[0]; + let q_data_ptr = in_slice[1]; + + let u = core::slice::from_raw_parts(u_ptr, q as usize); + let q_data = core::slice::from_raw_parts(q_data_ptr, q as usize); + + let out_slice = core::slice::from_raw_parts_mut(out, 1); + + let v_ptr = out_slice[0]; + let v = core::slice::from_raw_parts_mut(v_ptr, q as usize); + + for i in 0..q as usize { + v[i] = q_data[i] * u[i]; + } + + 0 +} diff --git a/examples/rust-qfunctions/ex1-volume.c b/examples/rust-qfunctions/ex1-volume.c new file mode 100644 index 0000000000..906ab1aff7 --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume.c @@ -0,0 +1,439 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// libCEED Example 1 +// +// This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator. +// Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code. +// +// The example has no dependencies, and is designed to be self-contained. +// For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples. +// +// All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed). 
+// +// Build with: +// +// make ex1-volume [CEED_DIR=</path/to/libceed>] +// +// Sample runs: +// +// ./ex1-volume +// ./ex1-volume -ceed /cpu/self +// ./ex1-volume -ceed /gpu/cuda +// +// Test in 1D-3D +//TESTARGS(name="1D User QFunction") -ceed {ceed_resource} -d 1 -t +//TESTARGS(name="2D User QFunction") -ceed {ceed_resource} -d 2 -t +//TESTARGS(name="3D User QFunction") -ceed {ceed_resource} -d 3 -t +//TESTARGS(name="1D Gallery QFunction") -ceed {ceed_resource} -d 1 -t -g +//TESTARGS(name="2D Gallery QFunction") -ceed {ceed_resource} -d 2 -t -g +//TESTARGS(name="3D Gallery QFunction") -ceed {ceed_resource} -d 3 -t -g + +/// @file +/// libCEED example using mass operator to compute volume + +#include "ex1-volume.h" + +#include +#include +#include +#include +#include +#include + +// Auxiliary functions +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]); +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction); +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords); +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords); + +// Main example +int main(int argc, const char *argv[]) { + const char *ceed_spec = "/cpu/self"; + CeedInt dim = 3; // dimension of the mesh + CeedInt num_comp_x = 3; // number of x components + CeedInt mesh_degree = 4; // polynomial degree for the mesh + CeedInt sol_degree = 4; // polynomial degree for the solution + CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points + CeedInt prob_size = -1; // approximate problem size + CeedInt help = 0, test = 0, gallery = 0, benchmark = 0; + + // Process command line arguments.
+ for (int ia = 1; ia < argc; ia++) { + // LCOV_EXCL_START + int next_arg = ((ia + 1) < argc), parse_error = 0; + if (!strcmp(argv[ia], "-h")) { + help = 1; + } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) { + parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1; + } else if (!strcmp(argv[ia], "-d")) { + parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1; + num_comp_x = dim; + } else if (!strcmp(argv[ia], "-m")) { + parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-p")) { + parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-q")) { + parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-s")) { + parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-b")) { + parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-t")) { + test = 1; + } else if (!strcmp(argv[ia], "-g")) { + gallery = 1; + } + if (parse_error) { + printf("Error parsing command line options.\n"); + return 1; + } + // LCOV_EXCL_STOP + } + if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024; + + // Print the values of all options: + if (!test || help) { + // LCOV_EXCL_START + printf("Selected options: [command line option] : \n"); + printf(" Ceed specification [-c] : %s\n", ceed_spec); + printf(" Mesh dimension [-d] : %" CeedInt_FMT "\n", dim); + printf(" Mesh degree [-m] : %" CeedInt_FMT "\n", mesh_degree); + printf(" Solution degree [-p] : %" CeedInt_FMT "\n", sol_degree); + printf(" Num. 1D quadrature pts [-q] : %" CeedInt_FMT "\n", num_qpts); + printf(" Approx. # unknowns [-s] : %" CeedInt_FMT "\n", prob_size); + printf(" QFunction source [-g] : %s\n", gallery ? "gallery" : "header"); + if (help) { + printf("Test/quiet mode is %s\n", (test ? 
"ON" : "OFF (use -t to enable)")); + return 0; + } + printf("\n"); + // LCOV_EXCL_STOP + } + + // Select appropriate backend and logical device based on the (-ceed) command line argument. + Ceed ceed; + + CeedInit(ceed_spec, &ceed); + + // Add the path to the Rust crate to the ceed object. + { + char root[2048] = __FILE__; + char *last_slash = strrchr(root, '/'); + + strncpy(last_slash + 1, "ex1-volume-rs", 14); + CeedAddRustSourceRoot(ceed, root); + } + + // Construct the mesh and solution bases. + CeedBasis mesh_basis, sol_basis; + + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis); + + // Determine the mesh size based on the given approximate problem size. + CeedInt num_xyz[dim]; + + GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz); + if (!test) { + // LCOV_EXCL_START + printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]); + if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]); + if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]); + printf("\n"); + // LCOV_EXCL_STOP + } + + // Build CeedElemRestriction objects describing the mesh and solution discrete representations. + CeedInt mesh_size, sol_size; + CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction; + + BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL); + BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, &q_data_restriction); + if (!test) { + // LCOV_EXCL_START + printf("Number of mesh nodes : %" CeedInt_FMT "\n", mesh_size / dim); + printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size); + // LCOV_EXCL_STOP + } + + // Create a CeedVector with the mesh coordinates. 
+ CeedVector mesh_coords; + + CeedVectorCreate(ceed, mesh_size, &mesh_coords); + SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords); + + // Apply a transformation to the mesh. + CeedScalar exact_volume = TransformMeshCoords(dim, mesh_size, mesh_coords); + + // Context data to be passed to the 'build_mass' QFunction. + CeedQFunctionContext build_ctx; + struct BuildContext build_ctx_data; + + build_ctx_data.dim = build_ctx_data.space_dim = dim; + CeedQFunctionContextCreate(ceed, &build_ctx); + CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); + + // Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data. + CeedQFunction qf_build; + + if (gallery) { + // This creates the QFunction via the gallery. + char name[13] = ""; + snprintf(name, sizeof name, "Mass%" CeedInt_FMT "DBuild", dim); + CeedQFunctionCreateInteriorByName(ceed, name, &qf_build); + } else { + // This creates the QFunction directly. + CeedQFunctionCreateInterior(ceed, 1, build_mass, build_mass_loc, &qf_build); + CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_build, "qdata", 1, CEED_EVAL_NONE); + CeedQFunctionSetContext(qf_build, build_ctx); + } + + // Create the operator that builds the quadrature data for the mass operator. + CeedOperator op_build; + + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); + CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); + CeedOperatorSetField(op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + + // Compute the quadrature data for the mass operator. 
+ CeedVector q_data; + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); + CeedInt num_elem = 1; + + for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d]; + CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data); + CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE); + + // Create the QFunction that defines the action of the mass operator. + CeedQFunction qf_apply; + + if (gallery) { + // This creates the QFunction via the gallery. + CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply); + } else { + // This creates the QFunction directly. + CeedQFunctionCreateInterior(ceed, 1, apply_mass, apply_mass_loc, &qf_apply); + CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_apply, "qdata", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP); + } + + // Create the mass operator. + CeedOperator op_apply; + + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); + CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE); + + // Create auxiliary solution-size vectors. + CeedVector u, v; + + CeedVectorCreate(ceed, sol_size, &u); + CeedVectorCreate(ceed, sol_size, &v); + + // Initialize 'u' with ones. 
+ CeedVectorSetValue(u, 1.0); + + // Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1 + CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + + // Benchmark runs + if (!test && benchmark) { + // LCOV_EXCL_START + printf(" Executing %d benchmarking runs...\n", benchmark); + // LCOV_EXCL_STOP + } + for (CeedInt i = 0; i < benchmark; i++) { + // LCOV_EXCL_START + CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE); + // LCOV_EXCL_STOP + } + + // Compute and print the sum of the entries of 'v' giving the mesh volume. + CeedScalar volume = 0.; + + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + if (!test) { + // LCOV_EXCL_START + printf(" done.\n"); + printf("Exact mesh volume : % .14g\n", exact_volume); + printf("Computed mesh volume : % .14g\n", volume); + printf("Volume error : % .14g\n", volume - exact_volume); + // LCOV_EXCL_STOP + } else { + CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5); + + if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume); + } + + // Free dynamically allocated memory. 
+ CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&mesh_coords); + CeedOperatorDestroy(&op_apply); + CeedQFunctionDestroy(&qf_apply); + CeedQFunctionContextDestroy(&build_ctx); + CeedOperatorDestroy(&op_build); + CeedQFunctionDestroy(&qf_build); + CeedElemRestrictionDestroy(&sol_restriction); + CeedElemRestrictionDestroy(&mesh_restriction); + CeedElemRestrictionDestroy(&q_data_restriction); + CeedBasisDestroy(&sol_basis); + CeedBasisDestroy(&mesh_basis); + CeedDestroy(&ceed); + return 0; +} + +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) { + // Use the approximate formula: + // prob_size ~ num_elem * degree^dim + CeedInt num_elem = prob_size / CeedIntPow(degree, dim); + CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem + + while (num_elem > 1) { + num_elem /= 2; + s++; + } + CeedInt r = s % dim; + + for (CeedInt d = 0; d < dim; d++) { + CeedInt sd = s / dim; + + if (r > 0) { + sd++; + r--; + } + num_xyz[d] = 1 << sd; + } + return 0; +} + +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction) { + CeedInt p = degree + 1; + CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element + CeedInt nd[3], num_elem = 1, scalar_size = 1; + + for (CeedInt d = 0; d < dim; d++) { + num_elem *= num_xyz[d]; + nd[d] = num_xyz[d] * (p - 1) + 1; + scalar_size *= nd[d]; + } + *size = scalar_size * num_comp; + // elem: 0 1 n-1 + // |---*-...-*---|---*-...-*---|- ... 
-|--...--| + // num_nodes: 0 1 p-1 p p+1 2*p n*p + CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes); + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt e_xyz[3] = {1, 1, 1}, re = e; + + for (CeedInt d = 0; d < dim; d++) { + e_xyz[d] = re % num_xyz[d]; + re /= num_xyz[d]; + } + CeedInt *local_elem_nodes = elem_nodes + e * num_nodes; + + for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) { + CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes; + + for (CeedInt d = 0; d < dim; d++) { + g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride; + g_nodes_stride *= nd[d]; + r_nodes /= p; + } + local_elem_nodes[l_nodes] = g_nodes; + } + } + CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes, + restriction); + if (q_data_restriction) { + CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction); + } + free(elem_nodes); + return 0; +} + +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) { + CeedInt p = mesh_degree + 1; + CeedInt nd[3], scalar_size = 1; + + for (CeedInt d = 0; d < dim; d++) { + nd[d] = num_xyz[d] * (p - 1) + 1; + scalar_size *= nd[d]; + } + CeedScalar *coords; + + CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords); + CeedScalar *nodes = malloc(sizeof(CeedScalar) * p); + + // The H1 basis uses Lobatto quadrature points as nodes. 
+ CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] + for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i]; + for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) { + CeedInt r_nodes = gs_nodes; + + for (CeedInt d = 0; d < dim; d++) { + CeedInt d_1d = r_nodes % nd[d]; + + coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d]; + r_nodes /= nd[d]; + } + } + free(nodes); + CeedVectorRestoreArray(mesh_coords, &coords); + return 0; +} + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#define M_PI_2 1.57079632679489661923 +#endif + +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) { + CeedScalar exact_volume; + CeedScalar *coords; + + CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords); + if (dim == 1) { + for (CeedInt i = 0; i < mesh_size; i++) { + // map [0,1] to [0,1] varying the mesh density + coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5)); + } + exact_volume = 1.; + } else { + CeedInt num_nodes = mesh_size / dim; + + for (CeedInt i = 0; i < num_nodes; i++) { + // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar + // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi + CeedScalar u = coords[i], v = coords[i + num_nodes]; + + u = 1. + u; + v = M_PI_2 * v; + coords[i] = u * cos(v); + coords[i + num_nodes] = u * sin(v); + } + exact_volume = 3. / 4. * M_PI; + } + CeedVectorRestoreArray(mesh_coords, &coords); + return exact_volume; +} diff --git a/examples/rust-qfunctions/ex1-volume.h b/examples/rust-qfunctions/ex1-volume.h new file mode 100644 index 0000000000..1c2baa8fc2 --- /dev/null +++ b/examples/rust-qfunctions/ex1-volume.h @@ -0,0 +1,19 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +/// A structure used to pass additional data to f_build_mass +struct BuildContext { + CeedInt dim, space_dim; +}; + +// References the rust file for the qfunction named build_mass_rs +CEED_QFUNCTION_RUST(build_mass) + +// References the rust file for the qfunction named apply_mass_rs +CEED_QFUNCTION_RUST(apply_mass) diff --git a/examples/rust/ex3-vector-volume/.gitignore b/examples/rust/ex1-volume-vector/.gitignore similarity index 100% rename from examples/rust/ex3-vector-volume/.gitignore rename to examples/rust/ex1-volume-vector/.gitignore diff --git a/examples/rust/ex3-vector-volume/Cargo.toml b/examples/rust/ex1-volume-vector/Cargo.toml similarity index 91% rename from examples/rust/ex3-vector-volume/Cargo.toml rename to examples/rust/ex1-volume-vector/Cargo.toml index 3bee448ac7..d3f5b74832 100644 --- a/examples/rust/ex3-vector-volume/Cargo.toml +++ b/examples/rust/ex1-volume-vector/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ex3-vector-volume" +name = "ex1-volume-vector" version = "0.11.0" authors = [ "Jeremy L Thompson ", diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex1-volume-vector/src/main.rs similarity index 87% rename from examples/rust/ex3-vector-volume/src/main.rs rename to examples/rust/ex1-volume-vector/src/main.rs index 9b3cced2b8..85921e688d 100644 --- a/examples/rust/ex3-vector-volume/src/main.rs +++ b/examples/rust/ex1-volume-vector/src/main.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -20,20 +20,23 @@ // line argument (-ceed). 
use clap::Parser; -use libceed::{prelude::*, Ceed}; +use libceed::{ + BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt, +}; mod opt; mod transform; // ---------------------------------------------------------------------------- // Example 3 // ---------------------------------------------------------------------------- -#[cfg(not(tarpaulin_include))] fn main() -> libceed::Result<()> { let options = opt::Opt::parse(); - example_3(options) + example_1_vector(options) } -fn example_3(options: opt::Opt) -> libceed::Result<()> { +#[allow(clippy::erasing_op)] +#[allow(clippy::identity_op)] +fn example_1_vector(options: opt::Opt) -> libceed::Result<()> { // Process command line arguments let opt::Opt { ceed_spec, @@ -46,17 +49,20 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { quiet, gallery, } = options; - assert!(dim >= 1 && dim <= 3); + assert!((1..=3).contains(&dim)); assert!(mesh_degree >= 1); assert!(solution_degree >= 1); assert!(num_qpts >= 1); let ncomp_x = dim; - let problem_size: i64; - if problem_size_requested < 0 { - problem_size = if test { 8 * 16 } else { 256 * 1024 }; + let problem_size: i64 = if problem_size_requested < 0 { + if test { + 8 * 16 + } else { + 256 * 1024 + } } else { - problem_size = problem_size_requested; - } + problem_size_requested + }; let ncomp_u = 3; // Summary output @@ -78,14 +84,19 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { let ceed = Ceed::init(&ceed_spec); // Mesh and solution bases - let basis_mesh = - ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?; + let basis_mesh = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_x, + mesh_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; let basis_solution = ceed.basis_tensor_H1_Lagrange( dim, ncomp_u, solution_degree + 1, num_qpts, - QuadMode::Gauss, + libceed::QuadMode::Gauss, )?; // Determine mesh size from approximate problem size @@ -98,7 +109,7 @@ fn
example_3(options: opt::Opt) -> libceed::Result<()> { if dim > 2 { print!(", nz = {}", num_xyz[2]); } - print!("\n"); + println!(); } // Build ElemRestriction objects describing the mesh and solution discrete @@ -167,9 +178,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { }; let qf_build_closure = ceed .q_function_interior(1, Box::new(build_mass))? - .input("dx", ncomp_x * dim, EvalMode::Grad)? - .input("weights", 1, EvalMode::Weight)? - .output("qdata", 1, EvalMode::None)?; + .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)? + .input("weights", 1, libceed::EvalMode::Weight)? + .output("qdata", 1, libceed::EvalMode::None)?; // -- QFunction from gallery let qf_build_named = { let name = format!("Mass{}DBuild", dim); @@ -218,9 +229,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { }; let qf_mass_closure = ceed .q_function_interior(1, Box::new(apply_mass))? - .input("u", ncomp_u, EvalMode::Interp)? - .input("qdata", 1, EvalMode::None)? - .output("v", ncomp_u, EvalMode::Interp)?; + .input("u", ncomp_u, libceed::EvalMode::Interp)? + .input("qdata", 1, libceed::EvalMode::None)? 
+ .output("v", ncomp_u, libceed::EvalMode::Interp)?; // -- QFunction from gallery let qf_mass_named = ceed.q_function_interior_by_name("Vector3MassApply")?; // -- QFunction for use with Operator @@ -256,7 +267,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { op_mass.apply(&u, &mut v)?; // Compute the mesh volume - let volume: Scalar = v.view()?.iter().sum::<Scalar>() + let volume: libceed::Scalar = v.view()?.iter().sum::<libceed::Scalar>() / ((ncomp_u * (ncomp_u + 1)) / 2) as libceed::Scalar; // Output results @@ -269,7 +280,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { ); } let tolerance = match dim { - 1 => 100.0 * libceed::EPSILON, + 1 => 200.0 * libceed::EPSILON, _ => 1E-5, }; let error = (volume - exact_volume).abs(); @@ -293,7 +304,7 @@ mod tests { use super::*; #[test] - fn example_3_1d() { + fn example_1_vector_1d() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 1, @@ -305,11 +316,11 @@ mod tests { quiet: true, gallery: false, }; - assert!(example_3(options).is_ok()); + assert!(example_1_vector(options).is_ok()); } #[test] - fn example_3_2d() { + fn example_1_vector_2d() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 2, @@ -321,11 +332,11 @@ mod tests { quiet: true, gallery: false, }; - assert!(example_3(options).is_ok()); + assert!(example_1_vector(options).is_ok()); } #[test] - fn example_3_3d() { + fn example_1_vector_3d() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 3, @@ -337,11 +348,11 @@ mod tests { quiet: false, gallery: false, }; - assert!(example_3(options).is_ok()); + assert!(example_1_vector(options).is_ok()); } #[test] - fn example_3_1d_gallery() { + fn example_1_vector_1d_gallery() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 1, @@ -353,11 +364,11 @@ mod tests { quiet: true, gallery: true, }; - assert!(example_3(options).is_ok()); + assert!(example_1_vector(options).is_ok()); } #[test] - fn
example_3_2d_gallery() { + fn example_1_vector_2d_gallery() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 2, @@ -369,11 +380,11 @@ mod tests { quiet: true, gallery: true, }; - assert!(example_3(options).is_ok()); + assert!(example_1_vector(options).is_ok()); } #[test] - fn example_3_3d_gallery() { + fn example_1_vector_3d_gallery() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 3, @@ -385,7 +396,7 @@ mod tests { quiet: true, gallery: true, }; - assert!(example_3(options).is_ok()); + assert!(example_1_vector(options).is_ok()); } } diff --git a/examples/rust/ex3-vector-volume/src/opt.rs b/examples/rust/ex1-volume-vector/src/opt.rs similarity index 95% rename from examples/rust/ex3-vector-volume/src/opt.rs rename to examples/rust/ex1-volume-vector/src/opt.rs index 5de7c68f08..37cbf0a3c0 100644 --- a/examples/rust/ex3-vector-volume/src/opt.rs +++ b/examples/rust/ex1-volume-vector/src/opt.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -15,7 +15,6 @@ use clap::Parser; name = "libCEED Rust Example 3 - Vector Volume", about = "This example uses the mass matrix to compute the length, area, or volume of a region in triplicate, depending upon runtime parameters." 
)] -#[cfg(not(tarpaulin_include))] pub(crate) struct Opt { /// libCEED backend resource to use #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")] diff --git a/examples/rust/ex3-vector-volume/src/transform.rs b/examples/rust/ex1-volume-vector/src/transform.rs similarity index 72% rename from examples/rust/ex3-vector-volume/src/transform.rs rename to examples/rust/ex1-volume-vector/src/transform.rs index 6ebe14bc6f..7073937353 100644 --- a/examples/rust/ex3-vector-volume/src/transform.rs +++ b/examples/rust/ex1-volume-vector/src/transform.rs @@ -1,27 +1,25 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -use libceed::prelude::*; - // ---------------------------------------------------------------------------- // Transform mesh coordinates // ---------------------------------------------------------------------------- pub(crate) fn transform_mesh_coordinates( dim: usize, mesh_size: usize, - mesh_coords: &mut Vector, -) -> libceed::Result<Scalar> { + mesh_coords: &mut libceed::Vector, +) -> libceed::Result<libceed::Scalar> { // Transform coordinates if dim == 1 { for coord in mesh_coords.view_mut()?.iter_mut() { // map [0,1] to [0,1] varying the mesh density *coord = 0.5 - + 1.0 / (3.0 as Scalar).sqrt() - * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin() + + 1.0 / (3.0 as libceed::Scalar).sqrt() + * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin() } } else { let mut coords = mesh_coords.view_mut()?; @@ -30,7 +28,7 @@ pub(crate) fn transform_mesh_coordinates( // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi let u = 1.0
+ coords[i]; - let v = std::f64::consts::PI as Scalar / 2.0 * coords[i + num_nodes]; + let v = std::f64::consts::PI as libceed::Scalar / 2.0 * coords[i + num_nodes]; coords[i] = u * v.cos(); coords[i + num_nodes] = u * v.sin(); } @@ -39,7 +37,8 @@ pub(crate) fn transform_mesh_coordinates( // Exact volume of transformed region let exact_volume = match dim { 1 => 1.0, - _ => 3.0 / 4.0 * std::f64::consts::PI as Scalar, + 2 | 3 => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar, + _ => unreachable!(), }; Ok(exact_volume) } diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs index bea2e2f79c..9020fb270c 100644 --- a/examples/rust/ex1-volume/src/main.rs +++ b/examples/rust/ex1-volume/src/main.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -19,19 +19,22 @@ // line argument (-ceed). 
use clap::Parser; -use libceed::{prelude::*, Ceed}; +use libceed::{ + BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt, +}; mod opt; mod transform; // ---------------------------------------------------------------------------- // Example 1 // ---------------------------------------------------------------------------- -#[cfg(not(tarpaulin_include))] fn main() -> libceed::Result<()> { let options = opt::Opt::parse(); example_1(options) } +#[allow(clippy::erasing_op)] +#[allow(clippy::identity_op)] fn example_1(options: opt::Opt) -> libceed::Result<()> { // Process command line arguments let opt::Opt { @@ -45,17 +48,20 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { quiet, gallery, } = options; - assert!(dim >= 1 && dim <= 3); + assert!((1..=3).contains(&dim)); assert!(mesh_degree >= 1); assert!(solution_degree >= 1); assert!(num_qpts >= 1); let ncomp_x = dim; - let problem_size: i64; - if problem_size_requested < 0 { - problem_size = if test { 8 * 16 } else { 256 * 1024 }; + let problem_size: i64 = if problem_size_requested < 0 { + if test { + 8 * 16 + } else { + 256 * 1024 + } } else { - problem_size = problem_size_requested; - } + problem_size_requested + }; // Summary output if !quiet { @@ -76,10 +82,20 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { let ceed = Ceed::init(&ceed_spec); // Mesh and solution bases - let basis_mesh = - ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?; - let basis_solution = - ceed.basis_tensor_H1_Lagrange(dim, 1, solution_degree + 1, num_qpts, QuadMode::Gauss)?; + let basis_mesh = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_x, + mesh_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; + let basis_solution = ceed.basis_tensor_H1_Lagrange( + dim, + 1, + solution_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; // Determine mesh size from approximate problem size let num_xyz = mesh::cartesian_mesh_size(dim, 
solution_degree, problem_size); @@ -91,7 +107,7 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { if dim > 2 { print!(", nz = {}", num_xyz[2]); } - print!("\n"); + println!(); } // Build ElemRestriction objects describing the mesh and solution discrete @@ -158,9 +174,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { }; let qf_build_closure = ceed .q_function_interior(1, Box::new(build_mass))? - .input("dx", ncomp_x * dim, EvalMode::Grad)? - .input("weights", 1, EvalMode::Weight)? - .output("qdata", 1, EvalMode::None)?; + .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)? + .input("weights", 1, libceed::EvalMode::Weight)? + .output("qdata", 1, libceed::EvalMode::None)?; // -- QFunction from gallery let qf_build_named = { let name = format!("Mass{}DBuild", dim); @@ -205,9 +221,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { }; let qf_mass_closure = ceed .q_function_interior(1, Box::new(apply_mass))? - .input("u", 1, EvalMode::Interp)? - .input("qdata", 1, EvalMode::None)? - .output("v", 1, EvalMode::Interp)?; + .input("u", 1, libceed::EvalMode::Interp)? + .input("qdata", 1, libceed::EvalMode::None)? + .output("v", 1, libceed::EvalMode::Interp)?; // -- QFunction from gallery let qf_mass_named = ceed.q_function_interior_by_name("MassApply")?; // -- QFunction for use with Operator @@ -234,7 +250,7 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { op_mass.apply(&u, &mut v)?; // Compute the mesh volume - let volume: Scalar = v.view()?.iter().sum(); + let volume: libceed::Scalar = v.view()?.iter().sum(); // Output results if !quiet { diff --git a/examples/rust/ex1-volume/src/opt.rs b/examples/rust/ex1-volume/src/opt.rs index 8fd8b71a0d..c93cd17180 100644 --- a/examples/rust/ex1-volume/src/opt.rs +++ b/examples/rust/ex1-volume/src/opt.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -15,7 +15,6 @@ use clap::Parser; name = "libCEED Rust Example 1 - Volume", about = "This example uses the mass matrix to compute the length, area, or volume of a region, depending upon runtime parameters." )] -#[cfg(not(tarpaulin_include))] pub(crate) struct Opt { /// libCEED backend resource to use #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")] diff --git a/examples/rust/ex1-volume/src/transform.rs b/examples/rust/ex1-volume/src/transform.rs index 6ebe14bc6f..7073937353 100644 --- a/examples/rust/ex1-volume/src/transform.rs +++ b/examples/rust/ex1-volume/src/transform.rs @@ -1,27 +1,25 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -use libceed::prelude::*; - // ---------------------------------------------------------------------------- // Transform mesh coordinates // ---------------------------------------------------------------------------- pub(crate) fn transform_mesh_coordinates( dim: usize, mesh_size: usize, - mesh_coords: &mut Vector, -) -> libceed::Result<Scalar> { + mesh_coords: &mut libceed::Vector, +) -> libceed::Result<libceed::Scalar> { // Transform coordinates if dim == 1 { for coord in mesh_coords.view_mut()?.iter_mut() { // map [0,1] to [0,1] varying the mesh density *coord = 0.5 - + 1.0 / (3.0 as Scalar).sqrt() - * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin() + + 1.0 / (3.0 as libceed::Scalar).sqrt() + * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin() } } else { let mut coords = mesh_coords.view_mut()?; @@ -30,7 +28,7 @@ pub(crate) fn transform_mesh_coordinates( // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi let u = 1.0 + coords[i]; - let v = std::f64::consts::PI as Scalar / 2.0 * coords[i + num_nodes]; + let v = std::f64::consts::PI as libceed::Scalar / 2.0 * coords[i + num_nodes]; coords[i] = u * v.cos(); coords[i + num_nodes] = u * v.sin(); } @@ -39,7 +37,8 @@ pub(crate) fn transform_mesh_coordinates( // Exact volume of transformed region let exact_volume = match dim { 1 => 1.0, - _ => 3.0 / 4.0 * std::f64::consts::PI as Scalar, + 2 | 3 => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar, + _ => unreachable!(), }; Ok(exact_volume) } diff --git a/examples/rust/ex4-vector-surface/.gitignore b/examples/rust/ex2-surface-vector/.gitignore similarity index 100% rename from examples/rust/ex4-vector-surface/.gitignore rename to examples/rust/ex2-surface-vector/.gitignore diff --git a/examples/rust/ex4-vector-surface/Cargo.toml
b/examples/rust/ex2-surface-vector/Cargo.toml similarity index 91% rename from examples/rust/ex4-vector-surface/Cargo.toml rename to examples/rust/ex2-surface-vector/Cargo.toml index 6b41826088..4eac55c52e 100644 --- a/examples/rust/ex4-vector-surface/Cargo.toml +++ b/examples/rust/ex2-surface-vector/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ex4-vector-surface" +name = "ex2-surface-vector" version = "0.11.0" authors = [ "Jeremy L Thompson ", diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex2-surface-vector/src/main.rs similarity index 87% rename from examples/rust/ex4-vector-surface/src/main.rs rename to examples/rust/ex2-surface-vector/src/main.rs index 5847d8033f..e2ff598d2e 100644 --- a/examples/rust/ex4-vector-surface/src/main.rs +++ b/examples/rust/ex2-surface-vector/src/main.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -21,20 +21,23 @@ // line argument (-ceed). 
use clap::Parser; -use libceed::{prelude::*, Ceed}; +use libceed::{ + BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt, +}; mod opt; mod transform; // ---------------------------------------------------------------------------- // Example 4 // ---------------------------------------------------------------------------- -#[cfg(not(tarpaulin_include))] fn main() -> libceed::Result<()> { let options = opt::Opt::parse(); - example_4(options) + example_2_vector(options) } -fn example_4(options: opt::Opt) -> libceed::Result<()> { +#[allow(clippy::erasing_op)] +#[allow(clippy::identity_op)] +fn example_2_vector(options: opt::Opt) -> libceed::Result<()> { // Process command line arguments let opt::Opt { ceed_spec, @@ -47,21 +50,20 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { quiet, gallery, } = options; - assert!(dim >= 1 && dim <= 3); + assert!((1..=3).contains(&dim)); assert!(mesh_degree >= 1); assert!(solution_degree >= 1); assert!(num_qpts >= 1); let ncomp_x = dim; - let problem_size: i64; - if problem_size_requested < 0 { - problem_size = if test { + let problem_size: i64 = if problem_size_requested < 0 { + if test { 16 * 16 * (dim * dim) as i64 } else { 256 * 1024 - }; + } } else { - problem_size = problem_size_requested; - } + problem_size_requested + }; let ncomp_u = 3; // Summary output @@ -83,14 +85,19 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { let ceed = Ceed::init(&ceed_spec); // Mesh and solution bases - let basis_mesh = - ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?; + let basis_mesh = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_x, + mesh_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; let basis_solution = ceed.basis_tensor_H1_Lagrange( dim, ncomp_u, solution_degree + 1, num_qpts, - QuadMode::Gauss, + libceed::QuadMode::Gauss, )?; // Determine mesh size from approximate problem size @@ -103,7 +110,7 @@ fn example_4(options:
opt::Opt) -> libceed::Result<()> { if dim > 2 { print!(", nz = {}", num_xyz[2]); } - print!("\n"); + println!(); } // Build ElemRestriction objects describing the mesh and solution discrete @@ -207,9 +214,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { }; let qf_build_closure = ceed .q_function_interior(1, Box::new(build_diff))? - .input("dx", ncomp_x * dim, EvalMode::Grad)? - .input("weights", 1, EvalMode::Weight)? - .output("qdata", dim * (dim + 1) / 2, EvalMode::None)?; + .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)? + .input("weights", 1, libceed::EvalMode::Weight)? + .output("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?; // -- QFunction from gallery let qf_build_named = { let name = format!("Poisson{}DBuild", dim); @@ -249,7 +256,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { match dim { 1 => { let q = qdata.len(); - for c in 0..3 { + for c in 0..ncomp_u { vg.iter_mut() .skip(c * q) .zip(ug.iter().skip(c * q).zip(qdata.iter())) @@ -259,12 +266,12 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { 2 => { let q = qdata.len() / 3; for i in 0..q { + let dxdxdxdx_t = [ + [qdata[i + 0 * q], qdata[i + 2 * q]], + [qdata[i + 2 * q], qdata[i + 1 * q]], + ]; for c in 0..ncomp_u { let du = [ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q]]; - let dxdxdxdx_t = [ - [qdata[i + 0 * q], qdata[i + 2 * q]], - [qdata[i + 2 * q], qdata[i + 1 * q]], - ]; for j in 0..dim { vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j]; @@ -275,17 +282,17 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { 3 => { let q = qdata.len() / 6; for i in 0..q { + let dxdxdxdx_t = [ + [qdata[i + 0 * q], qdata[i + 5 * q], qdata[i + 4 * q]], + [qdata[i + 5 * q], qdata[i + 1 * q], qdata[i + 3 * q]], + [qdata[i + 4 * q], qdata[i + 3 * q], qdata[i + 2 * q]], + ]; for c in 0..ncomp_u { let du = [ ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q], ug[i + (c + 2 * ncomp_u) * q], ]; - let 
dxdxdxdx_t = [ - [qdata[i + 0 * q], qdata[i + 5 * q], qdata[i + 4 * q]], - [qdata[i + 5 * q], qdata[i + 1 * q], qdata[i + 3 * q]], - [qdata[i + 4 * q], qdata[i + 3 * q], qdata[i + 2 * q]], - ]; for j in 0..dim { vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j] @@ -302,9 +309,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { }; let qf_diff_closure = ceed .q_function_interior(1, Box::new(apply_diff))? - .input("du", dim * ncomp_u, EvalMode::Grad)? - .input("qdata", dim * (dim + 1) / 2, EvalMode::None)? - .output("dv", dim * ncomp_u, EvalMode::Grad)?; + .input("du", dim * ncomp_u, libceed::EvalMode::Grad)? + .input("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)? + .output("dv", dim * ncomp_u, libceed::EvalMode::Grad)?; // -- QFunction from gallery let qf_diff_named = { let name = format!("Vector3Poisson{}DApply", dim); @@ -350,7 +357,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { op_diff.apply(&u, &mut v)?; // Compute the mesh surface area - let area: Scalar = v + let area: libceed::Scalar = v .view()? 
.iter() .map(|v| (*v).abs()) @@ -388,7 +395,7 @@ mod tests { use super::*; #[test] - fn example_4_1d() { + fn example_2_vector_1d() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 1, @@ -400,11 +407,11 @@ mod tests { quiet: true, gallery: false, }; - assert!(example_4(options).is_ok()); + assert!(example_2_vector(options).is_ok()); } #[test] - fn example_4_2d() { + fn example_2_vector_2d() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 2, @@ -416,11 +423,11 @@ mod tests { quiet: true, gallery: false, }; - assert!(example_4(options).is_ok()); + assert!(example_2_vector(options).is_ok()); } #[test] - fn example_4_3d() { + fn example_2_vector_3d() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 3, @@ -432,11 +439,11 @@ mod tests { quiet: false, gallery: false, }; - assert!(example_4(options).is_ok()); + assert!(example_2_vector(options).is_ok()); } #[test] - fn example_4_1d_gallery() { + fn example_2_vector_1d_gallery() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 1, @@ -448,11 +455,11 @@ mod tests { quiet: true, gallery: true, }; - assert!(example_4(options).is_ok()); + assert!(example_2_vector(options).is_ok()); } #[test] - fn example_4_2d_gallery() { + fn example_2_vector_2d_gallery() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 2, @@ -464,11 +471,11 @@ mod tests { quiet: true, gallery: true, }; - assert!(example_4(options).is_ok()); + assert!(example_2_vector(options).is_ok()); } #[test] - fn example_4_3d_gallery() { + fn example_2_vector_3d_gallery() { let options = opt::Opt { ceed_spec: "/cpu/self/ref/serial".to_string(), dim: 3, @@ -480,7 +487,7 @@ mod tests { quiet: true, gallery: true, }; - assert!(example_4(options).is_ok()); + assert!(example_2_vector(options).is_ok()); } } diff --git a/examples/rust/ex4-vector-surface/src/opt.rs b/examples/rust/ex2-surface-vector/src/opt.rs similarity index 95% 
rename from examples/rust/ex4-vector-surface/src/opt.rs rename to examples/rust/ex2-surface-vector/src/opt.rs index 8f58427120..ecbeb8c3cc 100644 --- a/examples/rust/ex4-vector-surface/src/opt.rs +++ b/examples/rust/ex2-surface-vector/src/opt.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -15,7 +15,6 @@ use clap::Parser; name = "libCEED Rust Example 4 - Vector Surface Area", about = "This example illustrates a simple usage of libCEED to compute the surface area of a body using matrix-free application of a 3 component vector diffusion operator." )] -#[cfg(not(tarpaulin_include))] pub(crate) struct Opt { /// libCEED backend resource to use #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")] diff --git a/examples/rust/ex4-vector-surface/src/transform.rs b/examples/rust/ex2-surface-vector/src/transform.rs similarity index 71% rename from examples/rust/ex4-vector-surface/src/transform.rs rename to examples/rust/ex2-surface-vector/src/transform.rs index 085d9bc94d..43cdbfe0f1 100644 --- a/examples/rust/ex4-vector-surface/src/transform.rs +++ b/examples/rust/ex2-surface-vector/src/transform.rs @@ -1,32 +1,31 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -use libceed::prelude::*; - // ---------------------------------------------------------------------------- // Transform mesh coordinates // ---------------------------------------------------------------------------- pub(crate) fn transform_mesh_coordinates( dim: usize, - mesh_coords: &mut Vector, -) -> libceed::Result<Scalar> { + mesh_coords: &mut libceed::Vector, +) -> libceed::Result<libceed::Scalar> { // Transform coordinates for coord in mesh_coords.view_mut()?.iter_mut() { // map [0,1] to [0,1] varying the mesh density *coord = 0.5 - + 1.0 / (3.0 as Scalar).sqrt() - * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin() + + 1.0 / (3.0 as libceed::Scalar).sqrt() + * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin() } // Exact surface area of transformed region let exact_area = match dim { 1 => 2.0, 2 => 4.0, - _ => 6.0, + 3 => 6.0, + _ => unreachable!(), }; Ok(exact_area) } diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs index 02349bc666..ee66c4663d 100644 --- a/examples/rust/ex2-surface/src/main.rs +++ b/examples/rust/ex2-surface/src/main.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -20,19 +20,22 @@ // line argument (-ceed).
use clap::Parser; -use libceed::{prelude::*, Ceed}; +use libceed::{ + BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt, +}; mod opt; mod transform; // ---------------------------------------------------------------------------- // Example 2 // ---------------------------------------------------------------------------- -#[cfg(not(tarpaulin_include))] fn main() -> libceed::Result<()> { let options = opt::Opt::parse(); example_2(options) } +#[allow(clippy::erasing_op)] +#[allow(clippy::identity_op)] fn example_2(options: opt::Opt) -> libceed::Result<()> { // Process command line arguments let opt::Opt { @@ -46,21 +49,20 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { quiet, gallery, } = options; - assert!(dim >= 1 && dim <= 3); + assert!((1..=3).contains(&dim)); assert!(mesh_degree >= 1); assert!(solution_degree >= 1); assert!(num_qpts >= 1); let ncomp_x = dim; - let problem_size: i64; - if problem_size_requested < 0 { - problem_size = if test { + let problem_size: i64 = if problem_size_requested < 0 { + if test { 16 * 16 * (dim * dim) as i64 } else { 256 * 1024 - }; + } } else { - problem_size = problem_size_requested; - } + problem_size_requested + }; // Summary output if !quiet { @@ -81,10 +83,20 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { let ceed = Ceed::init(&ceed_spec); // Mesh and solution bases - let basis_mesh = - ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?; - let basis_solution = - ceed.basis_tensor_H1_Lagrange(dim, 1, solution_degree + 1, num_qpts, QuadMode::Gauss)?; + let basis_mesh = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_x, + mesh_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; + let basis_solution = ceed.basis_tensor_H1_Lagrange( + dim, + 1, + solution_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; // Determine mesh size from approximate problem size let num_xyz = mesh::cartesian_mesh_size(dim,
solution_degree, problem_size); @@ -96,7 +108,7 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { if dim > 2 { print!(", nz = {}", num_xyz[2]); } - print!("\n"); + println!(); } // Build ElemRestriction objects describing the mesh and solution discrete @@ -111,7 +123,6 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { dim * (dim + 1) / 2, num_qpts, )?; - let (rstr_solution, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?; let mesh_size = rstr_mesh.lvector_size(); @@ -200,9 +211,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { }; let qf_build_closure = ceed .q_function_interior(1, Box::new(build_diff))? - .input("dx", ncomp_x * dim, EvalMode::Grad)? - .input("weights", 1, EvalMode::Weight)? - .output("qdata", dim * (dim + 1) / 2, EvalMode::None)?; + .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)? + .input("weights", 1, libceed::EvalMode::Weight)? + .output("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?; // -- QFunction from gallery let qf_build_named = { let name = format!("Poisson{}DBuild", dim); @@ -281,9 +292,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { }; let qf_diff_closure = ceed .q_function_interior(1, Box::new(apply_diff))? - .input("du", dim, EvalMode::Grad)? - .input("qdata", dim * (dim + 1) / 2, EvalMode::None)? - .output("dv", dim, EvalMode::Grad)?; + .input("du", dim, libceed::EvalMode::Grad)? + .input("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)? 
+ .output("dv", dim, libceed::EvalMode::Grad)?; // -- QFunction from gallery let qf_diff_named = { let name = format!("Poisson{}DApply", dim); @@ -320,7 +331,7 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { op_diff.apply(&u, &mut v)?; // Compute the mesh surface area - let area: Scalar = v.view()?.iter().map(|v| (*v).abs()).sum(); + let area: libceed::Scalar = v.view()?.iter().map(|v| (*v).abs()).sum(); // Output results if !quiet { diff --git a/examples/rust/ex2-surface/src/opt.rs b/examples/rust/ex2-surface/src/opt.rs index 13b58f26d7..f2c1afc8f2 100644 --- a/examples/rust/ex2-surface/src/opt.rs +++ b/examples/rust/ex2-surface/src/opt.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -15,7 +15,6 @@ use clap::Parser; name = "libCEED Rust Example 2 - Surface Area", about = "This example illustrates a simple usage of libCEED to compute the surface area of a body using matrix-free application of a diffusion operator." )] -#[cfg(not(tarpaulin_include))] pub(crate) struct Opt { /// libCEED backend resource to use #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")] diff --git a/examples/rust/ex2-surface/src/transform.rs b/examples/rust/ex2-surface/src/transform.rs index 085d9bc94d..43cdbfe0f1 100644 --- a/examples/rust/ex2-surface/src/transform.rs +++ b/examples/rust/ex2-surface/src/transform.rs @@ -1,32 +1,31 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -use libceed::prelude::*; - // ---------------------------------------------------------------------------- // Transform mesh coordinates // ---------------------------------------------------------------------------- pub(crate) fn transform_mesh_coordinates( dim: usize, - mesh_coords: &mut Vector, -) -> libceed::Result { + mesh_coords: &mut libceed::Vector, +) -> libceed::Result { // Transform coordinates for coord in mesh_coords.view_mut()?.iter_mut() { // map [0,1] to [0,1] varying the mesh density *coord = 0.5 - + 1.0 / (3.0 as Scalar).sqrt() - * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin() + + 1.0 / (3.0 as libceed::Scalar).sqrt() + * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin() } // Exact surface area of transformed region let exact_area = match dim { 1 => 2.0, 2 => 4.0, - _ => 6.0, + 3 => 6.0, + _ => unreachable!(), }; Ok(exact_area) } diff --git a/examples/rust/ex3-volume-vector/.gitignore b/examples/rust/ex3-volume-vector/.gitignore new file mode 100644 index 0000000000..a9d37c560c --- /dev/null +++ b/examples/rust/ex3-volume-vector/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/examples/rust/ex3-volume-vector/Cargo.toml b/examples/rust/ex3-volume-vector/Cargo.toml new file mode 100644 index 0000000000..bfbe7241e0 --- /dev/null +++ b/examples/rust/ex3-volume-vector/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "ex3-volume-vector" +version = "0.11.0" +authors = [ + "Jeremy L Thompson ", +] +edition = "2018" + +[dependencies] +clap = { version = "4.0.17", features = ["derive"] } +libceed = { path = "../../../rust/libceed" } +mesh = { path = "../mesh" } + +[package.metadata.release] +release = false diff --git a/examples/rust/ex3-volume-vector/src/main.rs b/examples/rust/ex3-volume-vector/src/main.rs new file mode 100644 index 0000000000..2eb530470e --- /dev/null +++ 
b/examples/rust/ex3-volume-vector/src/main.rs @@ -0,0 +1,438 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +// +// libCEED Example 3 +// +// This example illustrates a simple usage of libCEED to compute the volume of a +// 3D body using matrix-free application of a mass + diff operator. Arbitrary +// mesh and solution orders in 1D, 2D and 3D are supported from the same code. +// This calculation is executed in triplicate with a 3 component vector system. +// +// The example has no dependencies, and is designed to be self-contained. For +// additional examples that use external discretization libraries (MFEM, PETSc, +// etc.) see the subdirectories in libceed/examples. +// +// All libCEED objects use a Ceed device object constructed based on a command +// line argument (-ceed).
+ +use clap::Parser; +use libceed::{ + BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt, +}; +mod opt; +mod transform; + +// ---------------------------------------------------------------------------- +// Example 3 +// ---------------------------------------------------------------------------- +fn main() -> libceed::Result<()> { + let options = opt::Opt::parse(); + example_3_vector(options) +} + +#[allow(clippy::erasing_op)] +#[allow(clippy::identity_op)] +fn example_3_vector(options: opt::Opt) -> libceed::Result<()> { + // Process command line arguments + let opt::Opt { + ceed_spec, + dim, + mesh_degree, + solution_degree, + num_qpts, + problem_size_requested, + test, + quiet, + } = options; + assert!((1..=3).contains(&dim)); + assert!(mesh_degree >= 1); + assert!(solution_degree >= 1); + assert!(num_qpts >= 1); + let ncomp_x = dim; + let problem_size: i64 = if problem_size_requested < 0 { + if test { + 8 * 16 + } else { + 256 * 1024 + } + } else { + problem_size_requested + }; + let ncomp_u = 3; + + // Summary output + if !quiet { + println!("Selected options: [command line option] : "); + println!(" Ceed specification [-c] : {}", ceed_spec); + println!(" Mesh dimension [-d] : {}", dim); + println!(" Mesh degree [-m] : {}", mesh_degree); + println!(" Solution degree [-p] : {}", solution_degree); + println!(" Num. 1D quadr. pts [-q] : {}", num_qpts); + println!(" Approx.
# unknowns [-s] : {}", problem_size); + println!(" QFunction source : user closure"); + } + + // Initalize ceed context + let ceed = Ceed::init(&ceed_spec); + + // Mesh and solution bases + let basis_mesh = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_x, + mesh_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; + let basis_solution = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_u, + solution_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; + + // Determine mesh size from approximate problem size + let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size); + if !quiet { + print!("\nMesh size : nx = {}", num_xyz[0]); + if dim > 1 { + print!(", ny = {}", num_xyz[1]); + } + if dim > 2 { + print!(", nz = {}", num_xyz[2]); + } + println!(); + } + + // Build ElemRestriction objects describing the mesh and solution discrete + // representations + let (rstr_mesh, _) = + mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?; + let (_, rstr_qdata) = mesh::build_cartesian_restriction( + &ceed, + dim, + num_xyz, + solution_degree, + 1 + dim * (dim + 1) / 2, + num_qpts, + )?; + let (rstr_solution, _) = + mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, ncomp_u, num_qpts)?; + let mesh_size = rstr_mesh.lvector_size(); + let solution_size = rstr_solution.lvector_size(); + if !quiet { + println!("Number of mesh nodes : {}", mesh_size / dim); + println!("Number of solution nodes : {}", solution_size); + } + + // Create a Vector with the mesh coordinates + let mut mesh_coords = mesh::cartesian_mesh_coords(&ceed, dim, num_xyz, mesh_degree, mesh_size)?; + + // Apply a transformation to the mesh coordinates + let exact_volume = transform::transform_mesh_coordinates(dim, mesh_size, &mut mesh_coords)?; + + // QFunction that builds the quadrature data for the mass + diff operator + // -- QFunction from user closure + let build_mass_diff = move |[jacobian, weights, ..]: QFunctionInputs, + [qdata, 
..]: QFunctionOutputs| { + // Build quadrature data + match dim { + 1 => { + let q = qdata.len() / 2; + for i in 0..q { + // Mass + qdata[i + q * 0] = weights[i] * jacobian[i]; + // Diff + qdata[i + q * 1] = weights[i] / jacobian[i]; + } + } + 2 => { + let q = qdata.len() / 4; + for i in 0..q { + let j11 = jacobian[i + q * 0]; + let j21 = jacobian[i + q * 1]; + let j12 = jacobian[i + q * 2]; + let j22 = jacobian[i + q * 3]; + // Mass + qdata[i + q * 0] = weights[i] * (j11 * j22 - j21 * j12); + // Diff + let qw = weights[i] / (j11 * j22 - j21 * j12); + qdata[i + q * 1] = qw * (j12 * j12 + j22 * j22); + qdata[i + q * 2] = qw * (j11 * j11 + j21 * j21); + qdata[i + q * 3] = -qw * (j11 * j12 + j21 * j22); + } + } + 3 => { + let q = qdata.len() / 7; + for i in 0..q { + let mut a = [0.0; 9]; + for j in 0..3 { + for k in 0..3 { + a[k * 3 + j] = jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] + * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] + - jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] + * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 1) % 3))]; + } + } + // Mass + qdata[i + q * 0] = weights[i] + * (jacobian[i + q * 0] * a[0 * 3 + 0] + + jacobian[i + q * 1] * a[0 * 3 + 1] + + jacobian[i + q * 2] * a[0 * 3 + 2]); + let qw = weights[i] + / (jacobian[i + q * 0] * a[0 * 3 + 0] + + jacobian[i + q * 1] * a[0 * 3 + 1] + + jacobian[i + q * 2] * a[0 * 3 + 2]); + // Diff + qdata[i + q * 1] = qw + * (a[0 * 3 + 0] * a[0 * 3 + 0] + + a[0 * 3 + 1] * a[0 * 3 + 1] + + a[0 * 3 + 2] * a[0 * 3 + 2]); + qdata[i + q * 2] = qw + * (a[1 * 3 + 0] * a[1 * 3 + 0] + + a[1 * 3 + 1] * a[1 * 3 + 1] + + a[1 * 3 + 2] * a[1 * 3 + 2]); + qdata[i + q * 3] = qw + * (a[2 * 3 + 0] * a[2 * 3 + 0] + + a[2 * 3 + 1] * a[2 * 3 + 1] + + a[2 * 3 + 2] * a[2 * 3 + 2]); + qdata[i + q * 4] = qw + * (a[1 * 3 + 0] * a[2 * 3 + 0] + + a[1 * 3 + 1] * a[2 * 3 + 1] + + a[1 * 3 + 2] * a[2 * 3 + 2]); + qdata[i + q * 5] = qw + * (a[0 * 3 + 0] * a[2 * 3 + 0] + + a[0 * 3 + 1] * a[2 * 3 + 1] + + a[0 * 3 + 2] * a[2 * 3 
+ 2]); + qdata[i + q * 6] = qw + * (a[0 * 3 + 0] * a[1 * 3 + 0] + + a[0 * 3 + 1] * a[1 * 3 + 1] + + a[0 * 3 + 2] * a[1 * 3 + 2]); + } + } + _ => unreachable!(), + }; + + // Return clean error code + 0 + }; + let qf_build_closure = ceed + .q_function_interior(1, Box::new(build_mass_diff))? + .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)? + .input("weights", 1, libceed::EvalMode::Weight)? + .output("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?; + // -- QFunction for use with Operator + let qf_build = QFunctionOpt::SomeQFunction(&qf_build_closure); + + // Operator that build the quadrature data for the mass + diff operator + let op_build = ceed + .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)? + .name("build qdata")? + .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)? + .field( + "weights", + ElemRestrictionOpt::None, + &basis_mesh, + VectorOpt::None, + )? + .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)? + .check()?; + + // Compute the quadrature data for the mass + diff operator + let elem_qpts = num_qpts.pow(dim as u32); + let num_elem: usize = num_xyz.iter().take(dim).product(); + let mut qdata = ceed.vector(num_elem * elem_qpts * (1 + dim * (dim + 1) / 2))?; + op_build.apply(&mesh_coords, &mut qdata)?; + + // QFunction that applies the mass + diff operator + // -- QFunction from user closure + let apply_mass_diff = move |[u, ug, qdata, ..]: QFunctionInputs, + [v, vg, ..]: QFunctionOutputs| { + // Apply diffusion operator + match dim { + 1 => { + let q = qdata.len() / 2; + for i in 0..q { + for c in 0..ncomp_u { + // Mass + v[i + c * q] = u[i + c * q] * qdata[i + 0 * q]; + // Diff + vg[i + c * q] = ug[i + c * q] * qdata[i + 1 * q]; + } + } + } + 2 => { + let q = qdata.len() / 4; + for i in 0..q { + let dxdxdxdx_t = [ + [qdata[i + 1 * q], qdata[i + 3 * q]], + [qdata[i + 3 * q], qdata[i + 2 * q]], + ]; + for c in 0..ncomp_u { + // Mass + v[i + c * q] = u[i + c * q] * qdata[i + 0 * q]; + // Diff + let du = 
[ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q]]; + for j in 0..2 { + vg[i + (c + j * ncomp_u) * q] = + du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j]; + } + } + } + } + 3 => { + let q = qdata.len() / 7; + for i in 0..q { + let dxdxdxdx_t = [ + [qdata[i + 1 * q], qdata[i + 6 * q], qdata[i + 5 * q]], + [qdata[i + 6 * q], qdata[i + 2 * q], qdata[i + 4 * q]], + [qdata[i + 5 * q], qdata[i + 4 * q], qdata[i + 3 * q]], + ]; + for c in 0..ncomp_u { + // Mass + v[i + c * q] = u[i + c * q] * qdata[i + 0 * q]; + // Diff + let du = [ + ug[i + (c + 0 * ncomp_u) * q], + ug[i + (c + 1 * ncomp_u) * q], + ug[i + (c + 2 * ncomp_u) * q], + ]; + for j in 0..3 { + vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j] + + du[1] * dxdxdxdx_t[1][j] + + du[2] * dxdxdxdx_t[2][j]; + } + } + } + } + _ => unreachable!(), + }; + + // Return clean error code + 0 + }; + let qf_mass_diff_closure = ceed + .q_function_interior(1, Box::new(apply_mass_diff))? + .input("u", ncomp_u, libceed::EvalMode::Interp)? + .input("du", dim * ncomp_u, libceed::EvalMode::Grad)? + .input("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)? + .output("v", ncomp_u, libceed::EvalMode::Interp)? + .output("dv", dim * ncomp_u, libceed::EvalMode::Grad)?; + // -- QFunction for use with Operator + let qf_mass_diff = QFunctionOpt::SomeQFunction(&qf_mass_diff_closure); + + // Mass + diff Operator + let op_mass_diff = ceed + .operator(qf_mass_diff, QFunctionOpt::None, QFunctionOpt::None)? + .name("mass diff")? + .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)? + .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)?
+ .check()?; + + // Solution vectors + let mut u = ceed.vector(solution_size)?; + let mut v = ceed.vector(solution_size)?; + + // Initialize u with component index + u.set_value(0.0)?; + for c in 0..ncomp_u { + let q = solution_size / ncomp_u; + u.view_mut()?.iter_mut().skip(c * q).take(q).for_each(|u| { + *u = (c + 1) as libceed::Scalar; + }); + } + + // Apply the mass + diff operator + op_mass_diff.apply(&u, &mut v)?; + + // Compute the mesh volume + let volume: libceed::Scalar = v.view()?.iter().sum::<libceed::Scalar>() + / ((ncomp_u * (ncomp_u + 1)) / 2) as libceed::Scalar; + + // Output results + if !quiet { + println!("Exact mesh volume : {:.12}", exact_volume); + println!("Computed mesh volume : {:.12}", volume); + println!( + "Volume error : {:.12e}", + volume - exact_volume + ); + } + let tolerance = match dim { + 1 => 200.0 * libceed::EPSILON, + _ => 1E-5, + }; + let error = (volume - exact_volume).abs(); + if error > tolerance { + println!("Volume error too large: {:.12e}", error); + return Err(libceed::Error { + message: format!( + "Volume error too large - expected: {:.12e}, actual: {:.12e}", + tolerance, error + ), + }); + } + Ok(()) +} + +// ---------------------------------------------------------------------------- +// Tests +// ---------------------------------------------------------------------------- +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn example_3_vector_1d() { + let options = opt::Opt { + ceed_spec: "/cpu/self/ref/serial".to_string(), + dim: 1, + mesh_degree: 4, + solution_degree: 4, + num_qpts: 6, + problem_size_requested: -1, + test: true, + quiet: true, + }; + assert!(example_3_vector(options).is_ok()); + } + + #[test] + fn example_3_vector_2d() { + let options = opt::Opt { + ceed_spec: "/cpu/self/ref/serial".to_string(), + dim: 2, + mesh_degree: 4, + solution_degree: 4, + num_qpts: 6, + problem_size_requested: -1, + test: true, + quiet: true, + }; + assert!(example_3_vector(options).is_ok()); + } + + #[test] + fn
example_3_vector_vector_3d() { + let options = opt::Opt { + ceed_spec: "/cpu/self/ref/serial".to_string(), + dim: 3, + mesh_degree: 4, + solution_degree: 4, + num_qpts: 6, + problem_size_requested: -1, + test: true, + quiet: false, + }; + assert!(example_3_vector(options).is_ok()); + } +} + +// ---------------------------------------------------------------------------- diff --git a/examples/rust/ex3-volume-vector/src/opt.rs b/examples/rust/ex3-volume-vector/src/opt.rs new file mode 100644 index 0000000000..edf546b032 --- /dev/null +++ b/examples/rust/ex3-volume-vector/src/opt.rs @@ -0,0 +1,45 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +use clap::Parser; + +// ---------------------------------------------------------------------------- +// Command line arguments +// ---------------------------------------------------------------------------- +#[derive(Debug, Parser)] +#[command( + name = "libCEED Rust Example 3 - Volume", + about = "This example uses the mass matrix and diffusion matrices to compute the length, area, or volume of a region, depending upon runtime parameters." 
+)] +pub(crate) struct Opt { + /// libCEED backend resource to use + #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")] + pub(crate) ceed_spec: String, + /// Mesh dimension + #[arg(short, long = "dimension", default_value = "3")] + pub(crate) dim: usize, + /// Polynomial degree for the mesh + #[arg(short, long, default_value = "4")] + pub(crate) mesh_degree: usize, + /// Polynomial degree for the solution + #[arg(short = 'p', long, default_value = "4")] + pub(crate) solution_degree: usize, + /// Number of quadrature points in 1D + #[arg(short = 'q', long, default_value = "6")] + pub(crate) num_qpts: usize, + /// Approximate problem size + #[arg(name = "DoF", short = 's', long = "problem_size", default_value = "-1")] + pub(crate) problem_size_requested: i64, + /// Test mode + #[arg(short, long)] + pub(crate) test: bool, + /// Quiet mode + #[arg(short = 'x', long)] + pub(crate) quiet: bool, +} + +// ---------------------------------------------------------------------------- diff --git a/examples/rust/ex3-volume-vector/src/transform.rs b/examples/rust/ex3-volume-vector/src/transform.rs new file mode 100644 index 0000000000..e022a34860 --- /dev/null +++ b/examples/rust/ex3-volume-vector/src/transform.rs @@ -0,0 +1,50 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// ---------------------------------------------------------------------------- +// Transform mesh coordinates +// ---------------------------------------------------------------------------- +pub(crate) fn transform_mesh_coordinates( + dim: usize, + mesh_size: usize, + mesh_coords: &mut libceed::Vector, +) -> libceed::Result<libceed::Scalar> { + // Transform coordinates + match dim { + 1 => { + for coord in mesh_coords.view_mut()?.iter_mut() { + // map [0,1] to [0,1] varying the mesh density + *coord = 0.5 + + 1.0 / (3.0 as libceed::Scalar).sqrt() + * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)) + .sin() + } + } + _ => { + let num_nodes = mesh_size / dim; + let mut coords = mesh_coords.view_mut()?; + for i in 0..num_nodes { + // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar + // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi + let u = coords[i] + 1.; + let v = coords[i + num_nodes] * std::f64::consts::PI as libceed::Scalar / 2.; + coords[i] = u * v.cos(); + coords[i + num_nodes] = u * v.sin(); + } + } + } + + // Exact volume of transformed region + let exact_volume = match dim { + 1 => 1., + 2 | 3 => 3. / 4.
* std::f64::consts::PI as libceed::Scalar, + _ => unreachable!(), + }; + Ok(exact_volume) +} + +// ---------------------------------------------------------------------------- diff --git a/examples/rust/ex3-volume/.gitignore b/examples/rust/ex3-volume/.gitignore new file mode 100644 index 0000000000..a9d37c560c --- /dev/null +++ b/examples/rust/ex3-volume/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/examples/rust/ex3-volume/Cargo.toml b/examples/rust/ex3-volume/Cargo.toml new file mode 100644 index 0000000000..0c2f979c6b --- /dev/null +++ b/examples/rust/ex3-volume/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "ex3-volume" +version = "0.11.0" +authors = [ + "Jeremy L Thompson ", +] +edition = "2018" + +[dependencies] +clap = { version = "4.0.17", features = ["derive"] } +libceed = { path = "../../../rust/libceed" } +mesh = { path = "../mesh" } + +[package.metadata.release] +release = false diff --git a/examples/rust/ex3-volume/src/main.rs b/examples/rust/ex3-volume/src/main.rs new file mode 100644 index 0000000000..16c3dfcfa3 --- /dev/null +++ b/examples/rust/ex3-volume/src/main.rs @@ -0,0 +1,415 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +// +// libCEED Example 3 +// +// This example illustrates a simple usage of libCEED to compute the volume of a +// 3D body using matrix-free application of a mass + diff operator. Arbitrary +// mesh and solution orders in 1D, 2D and 3D are supported from the same code. +// +// The example has no dependencies, and is designed to be self-contained. For +// additional examples that use external discretization libraries (MFEM, PETSc, +// etc.) see the subdirectories in libceed/examples. +// +// All libCEED objects use a Ceed device object constructed based on a command +// line argument (-ceed).
+ +use clap::Parser; +use libceed::{ + BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt, +}; +mod opt; +mod transform; + +// ---------------------------------------------------------------------------- +// Example 3 +// ---------------------------------------------------------------------------- +fn main() -> libceed::Result<()> { + let options = opt::Opt::parse(); + example_3(options) +} + +#[allow(clippy::erasing_op)] +#[allow(clippy::identity_op)] +fn example_3(options: opt::Opt) -> libceed::Result<()> { + // Process command line arguments + let opt::Opt { + ceed_spec, + dim, + mesh_degree, + solution_degree, + num_qpts, + problem_size_requested, + test, + quiet, + } = options; + assert!((1..=3).contains(&dim)); + assert!(mesh_degree >= 1); + assert!(solution_degree >= 1); + assert!(num_qpts >= 1); + let ncomp_x = dim; + let problem_size: i64 = if problem_size_requested < 0 { + if test { + 8 * 16 + } else { + 256 * 1024 + } + } else { + problem_size_requested + }; + + // Summary output + if !quiet { + println!("Selected options: [command line option] : "); + println!(" Ceed specification [-c] : {}", ceed_spec); + println!(" Mesh dimension [-d] : {}", dim); + println!(" Mesh degree [-m] : {}", mesh_degree); + println!(" Solution degree [-p] : {}", solution_degree); + println!(" Num. 1D quadr. pts [-q] : {}", num_qpts); + println!(" Approx.
# unknowns [-s] : {}", problem_size); + println!(" QFunction source : user closure"); + } + + // Initalize ceed context + let ceed = Ceed::init(&ceed_spec); + + // Mesh and solution bases + let basis_mesh = ceed.basis_tensor_H1_Lagrange( + dim, + ncomp_x, + mesh_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; + let basis_solution = ceed.basis_tensor_H1_Lagrange( + dim, + 1, + solution_degree + 1, + num_qpts, + libceed::QuadMode::Gauss, + )?; + + // Determine mesh size from approximate problem size + let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size); + if !quiet { + print!("\nMesh size : nx = {}", num_xyz[0]); + if dim > 1 { + print!(", ny = {}", num_xyz[1]); + } + if dim > 2 { + print!(", nz = {}", num_xyz[2]); + } + println!(); + } + + // Build ElemRestriction objects describing the mesh and solution discrete + // representations + let (rstr_mesh, _) = + mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?; + let (_, rstr_qdata) = mesh::build_cartesian_restriction( + &ceed, + dim, + num_xyz, + solution_degree, + 1 + dim * (dim + 1) / 2, + num_qpts, + )?; + let (rstr_solution, _) = + mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?; + let mesh_size = rstr_mesh.lvector_size(); + let solution_size = rstr_solution.lvector_size(); + if !quiet { + println!("Number of mesh nodes : {}", mesh_size / dim); + println!("Number of solution nodes : {}", solution_size); + } + + // Create a Vector with the mesh coordinates + let mut mesh_coords = mesh::cartesian_mesh_coords(&ceed, dim, num_xyz, mesh_degree, mesh_size)?; + + // Apply a transformation to the mesh coordinates + let exact_volume = transform::transform_mesh_coordinates(dim, mesh_size, &mut mesh_coords)?; + + // QFunction that builds the quadrature data for the mass + diff operator + // -- QFunction from user closure + let build_mass_diff = move |[jacobian, weights, ..]: QFunctionInputs, + [qdata, ..]: 
QFunctionOutputs| { + // Build quadrature data + match dim { + 1 => { + let q = qdata.len() / 2; + for i in 0..q { + // Mass + qdata[i + q * 0] = weights[i] * jacobian[i]; + // Diff + qdata[i + q * 1] = weights[i] / jacobian[i]; + } + } + 2 => { + let q = qdata.len() / 4; + for i in 0..q { + let j11 = jacobian[i + q * 0]; + let j21 = jacobian[i + q * 1]; + let j12 = jacobian[i + q * 2]; + let j22 = jacobian[i + q * 3]; + // Mass + qdata[i + q * 0] = weights[i] * (j11 * j22 - j21 * j12); + // Diff + let qw = weights[i] / (j11 * j22 - j21 * j12); + qdata[i + q * 1] = qw * (j12 * j12 + j22 * j22); + qdata[i + q * 2] = qw * (j11 * j11 + j21 * j21); + qdata[i + q * 3] = -qw * (j11 * j12 + j21 * j22); + } + } + 3 => { + let q = qdata.len() / 7; + for i in 0..q { + let mut a = [0.0; 9]; + for j in 0..3 { + for k in 0..3 { + a[k * 3 + j] = jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] + * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] + - jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] + * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 1) % 3))]; + } + } + // Mass + qdata[i + q * 0] = weights[i] + * (jacobian[i + q * 0] * a[0 * 3 + 0] + + jacobian[i + q * 1] * a[0 * 3 + 1] + + jacobian[i + q * 2] * a[0 * 3 + 2]); + let qw = weights[i] + / (jacobian[i + q * 0] * a[0 * 3 + 0] + + jacobian[i + q * 1] * a[0 * 3 + 1] + + jacobian[i + q * 2] * a[0 * 3 + 2]); + // Diff + qdata[i + q * 1] = qw + * (a[0 * 3 + 0] * a[0 * 3 + 0] + + a[0 * 3 + 1] * a[0 * 3 + 1] + + a[0 * 3 + 2] * a[0 * 3 + 2]); + qdata[i + q * 2] = qw + * (a[1 * 3 + 0] * a[1 * 3 + 0] + + a[1 * 3 + 1] * a[1 * 3 + 1] + + a[1 * 3 + 2] * a[1 * 3 + 2]); + qdata[i + q * 3] = qw + * (a[2 * 3 + 0] * a[2 * 3 + 0] + + a[2 * 3 + 1] * a[2 * 3 + 1] + + a[2 * 3 + 2] * a[2 * 3 + 2]); + qdata[i + q * 4] = qw + * (a[1 * 3 + 0] * a[2 * 3 + 0] + + a[1 * 3 + 1] * a[2 * 3 + 1] + + a[1 * 3 + 2] * a[2 * 3 + 2]); + qdata[i + q * 5] = qw + * (a[0 * 3 + 0] * a[2 * 3 + 0] + + a[0 * 3 + 1] * a[2 * 3 + 1] + + a[0 * 3 + 2] * a[2 * 3 + 
2]); + qdata[i + q * 6] = qw + * (a[0 * 3 + 0] * a[1 * 3 + 0] + + a[0 * 3 + 1] * a[1 * 3 + 1] + + a[0 * 3 + 2] * a[1 * 3 + 2]); + } + } + _ => unreachable!(), + }; + + // Return clean error code + 0 + }; + let qf_build_closure = ceed + .q_function_interior(1, Box::new(build_mass_diff))? + .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)? + .input("weights", 1, libceed::EvalMode::Weight)? + .output("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?; + // -- QFunction for use with Operator + let qf_build = QFunctionOpt::SomeQFunction(&qf_build_closure); + + // Operator that build the quadrature data for the mass + diff operator + let op_build = ceed + .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)? + .name("build qdata")? + .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)? + .field( + "weights", + ElemRestrictionOpt::None, + &basis_mesh, + VectorOpt::None, + )? + .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)? + .check()?; + + // Compute the quadrature data for the mass + diff operator + let elem_qpts = num_qpts.pow(dim as u32); + let num_elem: usize = num_xyz.iter().take(dim).product(); + let mut qdata = ceed.vector(num_elem * elem_qpts * (1 + dim * (dim + 1) / 2))?; + op_build.apply(&mesh_coords, &mut qdata)?; + + // QFunction that applies the mass + diff operator + // -- QFunction from user closure + let apply_mass_diff = move |[u, ug, qdata, ..]: QFunctionInputs, + [v, vg, ..]: QFunctionOutputs| { + // Apply diffusion operator + match dim { + 1 => { + let q = qdata.len() / 2; + for i in 0..q { + // Mass + v[i] = u[i] * qdata[i + 0 * q]; + // Diff + vg[i] = ug[i] * qdata[i + 1 * q]; + } + } + 2 => { + let q = qdata.len() / 4; + for i in 0..q { + // Mass + v[i] = u[i] * qdata[i + 0 * q]; + // Diff + let du = [ug[i + q * 0], ug[i + q * 1]]; + let dxdxdxdx_t = [ + [qdata[i + 1 * q], qdata[i + 3 * q]], + [qdata[i + 3 * q], qdata[i + 2 * q]], + ]; + for j in 0..2 { + vg[i + j * q] = du[0] * dxdxdxdx_t[0][j] + du[1] * 
dxdxdxdx_t[1][j]; + } + } + } + 3 => { + let q = qdata.len() / 7; + for i in 0..q { + // Mass + v[i] = u[i] * qdata[i + 0 * q]; + // Diff + let du = [ug[i + q * 0], ug[i + q * 1], ug[i + q * 2]]; + let dxdxdxdx_t = [ + [qdata[i + 1 * q], qdata[i + 6 * q], qdata[i + 5 * q]], + [qdata[i + 6 * q], qdata[i + 2 * q], qdata[i + 4 * q]], + [qdata[i + 5 * q], qdata[i + 4 * q], qdata[i + 3 * q]], + ]; + for j in 0..3 { + vg[i + j * q] = du[0] * dxdxdxdx_t[0][j] + + du[1] * dxdxdxdx_t[1][j] + + du[2] * dxdxdxdx_t[2][j]; + } + } + } + _ => unreachable!(), + }; + + // Return clean error code + 0 + }; + let qf_mass_diff_closure = ceed + .q_function_interior(1, Box::new(apply_mass_diff))? + .input("u", 1, libceed::EvalMode::Interp)? + .input("du", dim, libceed::EvalMode::Grad)? + .input("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)? + .output("v", 1, libceed::EvalMode::Interp)? + .output("dv", dim, libceed::EvalMode::Grad)?; + // -- QFunction for use with Operator + let qf_mass_diff = QFunctionOpt::SomeQFunction(&qf_mass_diff_closure); + + // Mass + diff Operator + let op_mass_diff = ceed + .operator(qf_mass_diff, QFunctionOpt::None, QFunctionOpt::None)? + .name("mass diff")? + .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)? + .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)? 
+ .check()?; + + // Solution vectors + let u = ceed.vector_from_slice(&vec![1.0; solution_size])?; + let mut v = ceed.vector(solution_size)?; + + // Apply the mass + diff operator + op_mass_diff.apply(&u, &mut v)?; + + // Compute the mesh volume + let volume: libceed::Scalar = v.view()?.iter().sum(); + + // Output results + if !quiet { + println!("Exact mesh volume : {:.12}", exact_volume); + println!("Computed mesh volume : {:.12}", volume); + println!( + "Volume error : {:.12e}", + volume - exact_volume + ); + } + let tolerance = match dim { + 1 => 200.0 * libceed::EPSILON, + _ => 1E-5, + }; + let error = (volume - exact_volume).abs(); + if error > tolerance { + println!("Volume error too large: {:.12e}", error); + return Err(libceed::Error { + message: format!( + "Volume error too large - expected: {:.12e}, actual: {:.12e}", + tolerance, error + ), + }); + } + Ok(()) +} + +// ---------------------------------------------------------------------------- +// Tests +// ---------------------------------------------------------------------------- +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn example_3_1d() { + let options = opt::Opt { + ceed_spec: "/cpu/self/ref/serial".to_string(), + dim: 1, + mesh_degree: 4, + solution_degree: 4, + num_qpts: 6, + problem_size_requested: -1, + test: true, + quiet: true, + }; + assert!(example_3(options).is_ok()); + } + + #[test] + fn example_3_2d() { + let options = opt::Opt { + ceed_spec: "/cpu/self/ref/serial".to_string(), + dim: 2, + mesh_degree: 4, + solution_degree: 4, + num_qpts: 6, + problem_size_requested: -1, + test: true, + quiet: true, + }; + assert!(example_3(options).is_ok()); + } + + #[test] + fn example_3_3d() { + let options = opt::Opt { + ceed_spec: "/cpu/self/ref/serial".to_string(), + dim: 3, + mesh_degree: 4, + solution_degree: 4, + num_qpts: 6, + problem_size_requested: -1, + test: true, + quiet: false, + }; + assert!(example_3(options).is_ok()); + } +} + +// 
---------------------------------------------------------------------------- diff --git a/examples/rust/ex3-volume/src/opt.rs b/examples/rust/ex3-volume/src/opt.rs new file mode 100644 index 0000000000..edf546b032 --- /dev/null +++ b/examples/rust/ex3-volume/src/opt.rs @@ -0,0 +1,45 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +use clap::Parser; + +// ---------------------------------------------------------------------------- +// Command line arguments +// ---------------------------------------------------------------------------- +#[derive(Debug, Parser)] +#[command( + name = "libCEED Rust Example 3 - Volume", + about = "This example uses the mass matrix and diffusion matrices to compute the length, area, or volume of a region, depending upon runtime parameters." 
+)] +pub(crate) struct Opt { + /// libCEED backend resource to use + #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")] + pub(crate) ceed_spec: String, + /// Mesh dimension + #[arg(short, long = "dimension", default_value = "3")] + pub(crate) dim: usize, + /// Polynomial degree for the mesh + #[arg(short, long, default_value = "4")] + pub(crate) mesh_degree: usize, + /// Polynomial degree for the solution + #[arg(short = 'p', long, default_value = "4")] + pub(crate) solution_degree: usize, + /// Number of quadrature points in 1D + #[arg(short = 'q', long, default_value = "6")] + pub(crate) num_qpts: usize, + /// Approximate problem size + #[arg(name = "DoF", short = 's', long = "problem_size", default_value = "-1")] + pub(crate) problem_size_requested: i64, + /// Test mode + #[arg(short, long)] + pub(crate) test: bool, + /// Quiet mode + #[arg(short = 'x', long)] + pub(crate) quiet: bool, +} + +// ---------------------------------------------------------------------------- diff --git a/examples/rust/ex3-volume/src/transform.rs b/examples/rust/ex3-volume/src/transform.rs new file mode 100644 index 0000000000..e022a34860 --- /dev/null +++ b/examples/rust/ex3-volume/src/transform.rs @@ -0,0 +1,50 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +// ---------------------------------------------------------------------------- +// Transform mesh coordinates +// ---------------------------------------------------------------------------- +pub(crate) fn transform_mesh_coordinates( + dim: usize, + mesh_size: usize, + mesh_coords: &mut libceed::Vector, +) -> libceed::Result<libceed::Scalar> { + // Transform coordinates + match dim { + 1 => { + for coord in mesh_coords.view_mut()?.iter_mut() { + // map [0,1] to [0,1] varying the mesh density + *coord = 0.5 + + 1.0 / (3.0 as libceed::Scalar).sqrt() + * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)) + .sin() + } + } + _ => { + let num_nodes = mesh_size / dim; + let mut coords = mesh_coords.view_mut()?; + for i in 0..num_nodes { + // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar + // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi + let u = coords[i] + 1.; + let v = coords[i + num_nodes] * std::f64::consts::PI as libceed::Scalar / 2.; + coords[i] = u * v.cos(); + coords[i + num_nodes] = u * v.sin(); + } + } + } + + // Exact volume of transformed region + let exact_volume = match dim { + 1 => 1., + 2 | 3 => 3. / 4. * std::f64::consts::PI as libceed::Scalar, + _ => unreachable!(), + }; + Ok(exact_volume) +} + +// ---------------------------------------------------------------------------- diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs index 9ad2810381..ce48153b18 100644 --- a/examples/rust/mesh/src/lib.rs +++ b/examples/rust/mesh/src/lib.rs @@ -1,11 +1,12 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -use libceed::{prelude::*, Ceed}; +use libceed::{Ceed, ElemRestriction, Vector}; +use std::convert::TryInto; // ---------------------------------------------------------------------------- // Determine problem size in each dimension from size and dimenison @@ -22,16 +23,19 @@ pub fn cartesian_mesh_size(dim: usize, solution_degree: usize, problem_size: i64 // Size per dimension let mut r = s % dim; - let mut num_xyz = [0; 3]; - for d in 0..dim { - let mut sd = s / dim; - if r > 0 { - sd += 1; - r -= 1; - } - num_xyz[d] = 1 << sd; - } - num_xyz + let xyz: [usize; 3] = (0..3) + .map(|_| -> usize { + let mut sd = s / dim; + if r > 0 { + sd += 1; + r -= 1; + } + 1 << sd + }) + .collect::<Vec<usize>>() + .try_into() + .unwrap(); + xyz } // ---------------------------------------------------------------------------- @@ -44,7 +48,7 @@ pub fn build_cartesian_restriction( degree: usize, num_comp: usize, num_qpts: usize, -) -> libceed::Result<(ElemRestriction, ElemRestriction)> { +) -> libceed::Result<(ElemRestriction<'_>, ElemRestriction<'_>)> { let p = degree + 1; let num_nodes = p.pow(dim as u32); // number of nodes per element let elem_qpts = num_qpts.pow(dim as u32); // number of quadrature pts per element @@ -91,17 +95,17 @@ pub fn build_cartesian_restriction( num_comp, scalar_size, num_comp * scalar_size, - MemType::Host, + libceed::MemType::Host, &elem_nodes, )?; - // Quadratue data restriction + // Quadrature data restriction let rstr_qdata = ceed.strided_elem_restriction( num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, - CEED_STRIDES_BACKEND, + libceed::CEED_STRIDES_BACKEND, )?; Ok((rstr, rstr_qdata)) } @@ -115,7 +119,7 @@ pub fn cartesian_mesh_coords( num_xyz: [usize; 3], mesh_degree: usize, mesh_size: usize, -) -> libceed::Result<Vector> { +) -> libceed::Result<Vector<'_>> { let p = mesh_degree + 1; let mut num_d = [0; 3]; let mut scalar_size = 1; @@ -125,13 +129,14 @@ pub fn
cartesian_mesh_coords( } // Lobatto points - let lobatto_basis = ceed.basis_tensor_H1_Lagrange(1, 1, 2, p, QuadMode::GaussLobatto)?; + let lobatto_basis = + ceed.basis_tensor_H1_Lagrange(1, 1, 2, p, libceed::QuadMode::GaussLobatto)?; let nodes_corners = ceed.vector_from_slice(&[0.0, 1.0])?; let mut nodes_full = ceed.vector(p)?; lobatto_basis.apply( 1, - TransposeMode::NoTranspose, - EvalMode::Interp, + libceed::TransposeMode::NoTranspose, + libceed::EvalMode::Interp, &nodes_corners, &mut nodes_full, )?; @@ -146,8 +151,9 @@ pub fn cartesian_mesh_coords( let mut r_nodes = gs_nodes; for d in 0..dim { let d_1d = r_nodes % num_d[d]; - coords[gs_nodes + scalar_size * d] = - ((d_1d / (p - 1)) as Scalar + nodes[d_1d % (p - 1)]) / num_xyz[d] as Scalar; + coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) as libceed::Scalar + + nodes[d_1d % (p - 1)]) + / num_xyz[d] as libceed::Scalar; r_nodes /= num_d[d]; } } diff --git a/examples/solids/Makefile b/examples/solids/Makefile index 484d71eda7..490b229acc 100644 --- a/examples/solids/Makefile +++ b/examples/solids/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause @@ -15,7 +15,7 @@ CEED_DIR ?= ../.. 
ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc)) -CFLAGS = -std=c99 \ +CFLAGS = -std=c11 \ $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \ $(call pkgconf, --cflags-only-other $(PETSc.pc)) \ $(OPT) diff --git a/examples/solids/README.md b/examples/solids/README.md index 4f0d14ab86..31b4651c3f 100644 --- a/examples/solids/README.md +++ b/examples/solids/README.md @@ -1,7 +1,8 @@ # libCEED: Solid Mechanics Example This page provides a description of the solid mechanics example for the libCEED library, based on PETSc. -PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required. + +Ratel, a more fully featured solid mechanics library, can be found on [GitLab](https://gitlab.com/micromorph/ratel). This code solves the steady-state static momentum balance equations using unstructured high-order finite/spectral element spatial discretizations. In this mini-app, we consider three formulations used in solid mechanics applications: linear elasticity, Neo-Hookean hyperelasticity at small strain, and Neo-Hookean hyperelasticity at finite strain. @@ -21,7 +22,7 @@ and run with: ## Runtime options -% inclusion-solids-marker + The elasticity mini-app is controlled via command-line options, the following of which are mandatory. 
@@ -72,7 +73,7 @@ As an alternative example exploiting {code}`-dm_plex_box_faces`, we consider a { Sides 1 through 6 are rotated around $x$-axis: ``` -./elasticity -problem FSInitial-NH1 -E 1 -nu 0.3 -num_steps 40 -snes_linesearch_type cp -dm_plex_box_faces 4,4,4 -bc_clamp 1,2,3,4,5,6 -bc_clamp_1_rotate 0,0,1,0,.3 -bc_clamp_2_rotate 0,0,1,0,.3 -bc_clamp_3_rotate 0,0,1,0,.3 -bc_clamp_4_rotate 0,0,1,0,.3 -bc_clamp_5_rotate 0,0,1,0,.3 -bc_clamp_6_rotate 0,0,1,0,.3 +./elasticity -problem FS-NH -E 1 -nu 0.3 -num_steps 40 -snes_linesearch_type cp -dm_plex_box_faces 4,4,4 -bc_clamp 1,2,3,4,5,6 -bc_clamp_1_rotate 0,0,1,0,.3 -bc_clamp_2_rotate 0,0,1,0,.3 -bc_clamp_3_rotate 0,0,1,0,.3 -bc_clamp_4_rotate 0,0,1,0,.3 -bc_clamp_5_rotate 0,0,1,0,.3 -bc_clamp_6_rotate 0,0,1,0,.3 ``` :::{note} @@ -103,7 +104,7 @@ The command line options just shown are the minimum requirements to run the mini - * - `-problem` - - Problem to solve (`Linear`, `SS-NH`, `FSInitial-NH1`, etc.) + - Problem to solve (`Linear`, `FS-NH`, `FS-MR`, etc.) - `Linear` * - `-forcing` diff --git a/examples/solids/elasticity.c b/examples/solids/elasticity.c index 747f04835f..6c7db8fe97 100644 --- a/examples/solids/elasticity.c +++ b/examples/solids/elasticity.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -25,9 +25,9 @@ // // Sample meshes can be found at https://github.com/jeremylt/ceedSampleMeshes // -//TESTARGS(name="linear elasticity, MMS") -ceed {ceed_resource} -test -degree 3 -nu 0.3 -E 1 -dm_plex_box_faces 3,3,3 -//TESTARGS(name="Neo-Hookean hyperelasticity, initial configuration 1") -ceed {ceed_resource} -test -problem FSInitial-NH1 -E 2.8 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.124627916174e-01 -//TESTARGS(name="Mooney-Rivlin hyperelasticity, initial configuration 1") -ceed {ceed_resource} -test -problem FSInitial-MR1 -mu_1 .5 -mu_2 .5 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.339138880207e-01 +//TESTARGS(name="linear elasticity, MMS") -ceed {ceed_resource} -test -degree 3 -nu 0.3 -E 1 -dm_plex_box_faces 3,3,3 +//TESTARGS(name="Neo-Hookean hyperelasticity") -ceed {ceed_resource} -test -problem FS-NH -E 2.8 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.124627916174e-01 +//TESTARGS(name="Mooney-Rivlin hyperelasticity") -ceed {ceed_resource} -test -problem FS-MR -mu_1 .5 -mu_2 .5 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.339138880207e-01 /// @file /// CEED elasticity example using PETSc with DMPlex diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h index 7ac246eccc..9458668b30 100644 --- a/examples/solids/elasticity.h +++ b/examples/solids/elasticity.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -21,6 +21,6 @@ #include "include/utils.h" #include "problems/problems.h" -#if PETSC_VERSION_LT(3, 21, 0) -#error "PETSc v3.21 or later is required" +#if PETSC_VERSION_LT(3, 23, 0) +#error "PETSc v3.23 or later is required" #endif diff --git a/examples/solids/include/boundary.h b/examples/solids/include/boundary.h index 7143b7c262..ca5916b682 100644 --- a/examples/solids/include/boundary.h +++ b/examples/solids/include/boundary.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/cl-options.h b/examples/solids/include/cl-options.h index 9c56398139..1d4b8fc962 100644 --- a/examples/solids/include/cl-options.h +++ b/examples/solids/include/cl-options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/matops.h b/examples/solids/include/matops.h index ca57b33356..9b1fe843ba 100644 --- a/examples/solids/include/matops.h +++ b/examples/solids/include/matops.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/misc.h b/examples/solids/include/misc.h index 5836d14ff6..d6dc668b3f 100644 --- a/examples/solids/include/misc.h +++ b/examples/solids/include/misc.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/setup-dm.h b/examples/solids/include/setup-dm.h index 8fcfe7a63b..06c5347c18 100644 --- a/examples/solids/include/setup-dm.h +++ b/examples/solids/include/setup-dm.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/setup-libceed.h b/examples/solids/include/setup-libceed.h index be8ad14e9b..870f3bdf16 100644 --- a/examples/solids/include/setup-libceed.h +++ b/examples/solids/include/setup-libceed.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/structs.h b/examples/solids/include/structs.h index 8c63ce1199..f553002f93 100644 --- a/examples/solids/include/structs.h +++ b/examples/solids/include/structs.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/include/utils.h b/examples/solids/include/utils.h index 31188d47e3..709be45d3d 100644 --- a/examples/solids/include/utils.h +++ b/examples/solids/include/utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/index.md b/examples/solids/index.md index 910959b244..6d164bd9df 100644 --- a/examples/solids/index.md +++ b/examples/solids/index.md @@ -34,7 +34,7 @@ $$ (hyperelastic-cd) ## Running the mini-app ```{include} README.md -:start-after: inclusion-solids-marker +:start-after: ``` (problem-linear-elasticity)= @@ -526,254 +526,3 @@ In the case where complete linearization is preferred, note the symmetry $\maths Along with 6 entries for $\bm S$, this totals 27 entries of overhead compared to computing everything from $\bm F$. This compares with 13 entries of overhead for direct storage of $\{ \bm S, \bm C^{-1}, \log J \}$, which is sufficient for the Neo-Hookean model to avoid all but matrix products. ::: - -(problem-hyperelasticity-finite-strain-current-configuration)= - -## Hyperelasticity in current configuration - -In the preceeding discussion, all equations have been formulated in the initial configuration. -This may feel convenient in that the computational domain is clearly independent of the solution, but there are some advantages to defining the equations in the current configuration. - -1. Body forces (like gravity), traction, and contact are more easily defined in the current configuration. 
-2. Mesh quality in the initial configuration can be very bad for large deformation. -3. The required storage and numerical representation can be smaller in the current configuration. - -Most of the benefit in case 3 can be attained solely by moving the Jacobian representation to the current configuration {cite}`davydov2020matrix`, though residual evaluation may also be slightly faster in current configuration. -There are multiple commuting paths from the nonlinear weak form in initial configuration {eq}`hyperelastic-weak-form-initial` to the Jacobian weak form in current configuration {eq}`jacobian-weak-form-current`. -One may push forward to the current configuration and then linearize or linearize in initial configuration and then push forward, as summarized below. - -$$ -\begin{CD} - {\overbrace{\nabla_X \bm{v} \tcolon \bm{FS}}^{\text{Initial Residual}}} - @>{\text{push forward}}>{}> - {\overbrace{\nabla_x \bm{v} \tcolon \bm{\tau}}^{\text{Current Residual}}} \\ - @V{\text{linearize}}V{\begin{smallmatrix} \diff\bm F = \nabla_X\diff\bm u \\ \diff\bm S(\diff\bm E) \end{smallmatrix}}V - @V{\begin{smallmatrix} \diff\nabla_x\bm v = -\nabla_x\bm v \nabla_x \diff\bm u \\ \diff\bm\tau(\diff\bm\epsilon) \end{smallmatrix}}V{\text{linearize}}V \\ - {\underbrace{\nabla_X\bm{v}\tcolon \Big(\diff\bm{F}\bm{S} + \bm{F}\diff\bm{S}\Big)}_\text{Initial Jacobian}} - @>{\text{push forward}}>{}> - {\underbrace{\nabla_x\bm{v}\tcolon \Big(\diff\bm{\tau} -\bm{\tau}(\nabla_x \diff\bm{u})^T \Big)}_\text{Current Jacobian}} -\end{CD} -$$ (initial-current-linearize) - -We will follow both paths for consistency and because both intermediate representations may be useful for implementation. 
- -### Push forward, then linearize - -The first term of {eq}`hyperelastic-weak-form-initial` can be rewritten in terms of the symmetric Kirchhoff stress tensor -$\bm{\tau}=J\bm{\sigma}=\bm{P}\bm{F}^T = \bm F \bm S \bm F^T$ as - -$$ -\nabla_X \bm{v} \tcolon \bm{P} = \nabla_X \bm{v} \tcolon \bm{\tau}\bm{F}^{-T} = \nabla_X \bm{v}\bm{F}^{-1} \tcolon \bm{\tau} = \nabla_x \bm{v} \tcolon \bm{\tau} -$$ - -therefore, the weak form in terms of $\bm{\tau}$ and $\nabla_x$ with integral over $\Omega_0$ is - -$$ -\int_{\Omega_0}{\nabla_x \bm{v} \tcolon \bm{\tau}} \, dV - - \int_{\Omega_0}{\bm{v} \cdot \rho_0 \bm{g}} \, dV - - \int_{\partial \Omega_0}{\bm{v}\cdot(\bm{P}\cdot\hat{\bm{N}})} \, dS - = 0, \quad \forall \bm v \in \mathcal V. -$$ (hyperelastic-weak-form-current) - -#### Linearize in current configuration - -To derive a Newton linearization of {eq}`hyperelastic-weak-form-current`, first we define - -$$ -\nabla_x \diff \bm{u} = \nabla_X \diff \bm{u} \ \bm{F}^{-1} = \diff \bm{F} \bm{F}^{-1} -$$ (nabla_xdu) - -and $\bm{\tau}$ for Neo-Hookean materials as the push forward of {eq}`neo-hookean-stress` - -$$ -\bm{\tau} = \bm{F}\bm{S}\bm{F}^T = \mu (\bm{b} - \bm I_3) + \lambda \log J \bm{I}_3, -$$ (tau-neo-hookean) - -where $\bm{b} = \bm{F} \bm{F}^T$, is the left Cauchy-Green tensor. -Then by expanding the directional derivative of $\nabla_x \bm{v} \tcolon \bm{\tau}$, we arrive at - -$$ -\diff \ (\nabla_x \bm{v} \tcolon \bm{\tau}) = \diff \ (\nabla_x \bm{v})\tcolon \bm{\tau} + \nabla_x \bm{v} \tcolon \diff \bm{\tau} . 
-$$ (hyperelastic-linearization-current1) - -The first term of {eq}`hyperelastic-linearization-current1` can be written as - -$$ -\begin{aligned} \diff \ (\nabla_x \bm{v})\tcolon \bm{\tau} &= \diff \ (\nabla_X \bm{v} \bm{F}^{-1})\tcolon \bm{\tau} = \Big(\underbrace{\nabla_X (\diff \bm{v})}_{0}\bm{F}^{-1} + \nabla_X \bm{v}\diff \bm{F}^{-1}\Big)\tcolon \bm{\tau}\\ &= \Big(-\nabla_X \bm{v} \bm{F}^{-1}\diff\bm{F}\bm{F}^{-1}\Big)\tcolon \bm{\tau}=\Big(-\nabla_x \bm{v} \diff\bm{F}\bm{F}^{-1}\Big)\tcolon \bm{\tau}\\ &= \Big(-\nabla_x \bm{v} \nabla_x \diff\bm{u} \Big)\tcolon \bm{\tau}= -\nabla_x \bm{v}\tcolon\bm{\tau}(\nabla_x \diff\bm{u})^T \,, \end{aligned} -$$ - -where we have used $\diff \bm{F}^{-1}=-\bm{F}^{-1} \diff \bm{F} \bm{F}^{-1}$ and {eq}`nabla_xdu`. -Using this and {eq}`hyperelastic-linearization-current1` in {eq}`hyperelastic-weak-form-current` yields the weak form in the current configuration - -$$ -\int_{\Omega_0} \nabla_x \bm v \tcolon \Big(\diff\bm\tau - \bm\tau (\nabla_x \diff\bm u)^T \Big) = \text{rhs}. -$$ (jacobian-weak-form-current) - -In the following, we will sometimes make use of the incremental strain tensor in the current configuration, - -$$ -\diff\bm\epsilon \equiv \frac{1}{2}\Big(\nabla_x \diff\bm{u} + (\nabla_x \diff\bm{u})^T \Big) . 
-$$ - -:::{dropdown} Deriving $\diff\bm\tau$ for Neo-Hookean material -To derive a useful expression of $\diff\bm\tau$ for Neo-Hookean materials, we will use the representations - -$$ -\begin{aligned} -\diff \bm{b} &= \diff \bm{F} \bm{F}^T + \bm{F} \diff \bm{F}^T \\ -&= \nabla_x \diff \bm{u} \ \bm{b} + \bm{b} \ (\nabla_x \diff \bm{u})^T \\ -&= (\nabla_x \diff\bm u)(\bm b - \bm I_3) + (\bm b - \bm I_3) (\nabla_x \diff\bm u)^T + 2 \diff\bm\epsilon -\end{aligned} -$$ - -and - -$$ -\begin{aligned} \diff\ (\log J) &= \frac{\partial \log J}{\partial \bm{b}}\tcolon \diff \bm{b} = \frac{\partial J}{J\partial \bm{b}}\tcolon \diff \bm{b}=\frac{1}{2}\bm{b}^{-1}\tcolon \diff \bm{b} \\ &= \frac 1 2 \bm b^{-1} \tcolon \Big(\nabla_x \diff\bm u \ \bm b + \bm b (\nabla_x \diff\bm u)^T \Big) \\ &= \trace (\nabla_x \diff\bm u) \\ &= \trace \diff\bm\epsilon . \end{aligned} -$$ - -Substituting into {eq}`tau-neo-hookean` gives - -$$ -\begin{aligned} -\diff \bm{\tau} &= \mu \diff \bm{b} + \lambda \trace (\diff\bm\epsilon) \bm I_3 \\ -&= \underbrace{2 \mu \diff\bm\epsilon + \lambda \trace (\diff\bm\epsilon) \bm I_3 - 2\lambda \log J \diff\bm\epsilon}_{\bm F \diff\bm S \bm F^T} \\ -&\quad + (\nabla_x \diff\bm u)\underbrace{\Big( \mu (\bm b - \bm I_3) + \lambda \log J \bm I_3 \Big)}_{\bm\tau} \\ -&\quad + \underbrace{\Big( \mu (\bm b - \bm I_3) + \lambda \log J \bm I_3 \Big)}_{\bm\tau} (\nabla_x \diff\bm u)^T , -\end{aligned} -$$ (dtau-neo-hookean) - -where the final expression has been identified according to - -$$ -\diff\bm\tau = \diff\ (\bm F \bm S \bm F^T) = (\nabla_x \diff\bm u) \bm\tau + \bm F \diff\bm S \bm F^T + \bm\tau(\nabla_x \diff\bm u)^T. 
-$$ -::: - -Collecting terms, we may thus opt to use either of the two forms - -$$ -\begin{aligned} -\diff \bm{\tau} -\bm{\tau}(\nabla_x \diff\bm{u})^T &= (\nabla_x \diff\bm u)\bm\tau + \bm F \diff\bm S \bm F^T \\ -&= (\nabla_x \diff\bm u)\bm\tau + \lambda \trace(\diff\bm\epsilon) \bm I_3 + 2(\mu - \lambda \log J) \diff\bm\epsilon, -\end{aligned} -$$ (cur_simp_Jac) - -with the last line showing the especially compact representation available for Neo-Hookean materials. - -### Linearize, then push forward - -We can move the derivatives to the current configuration via - -$$ -\nabla_X \bm v \!:\! \diff\bm P = (\nabla_X \bm v) \bm F^{-1} \!:\! \diff \bm P \bm F^T = \nabla_x \bm v \!:\! \diff\bm P \bm F^T -$$ - -and expand - -$$ -\begin{aligned} -\diff\bm P \bm F^T &= \diff\bm F \bm S \bm F^T + \bm F \diff\bm S \bm F^T \\ -&= \underbrace{\diff\bm F \bm F^{-1}}_{\nabla_x \diff\bm u} \underbrace{\bm F \bm S \bm F^T}_{\bm\tau} + \bm F \diff\bm S \bm F^T . -\end{aligned} -$$ - -:::{dropdown} Representation of $\bm F \diff\bm S \bm F^T$ for Neo-Hookean materials -Now we push {eq}`eq-neo-hookean-incremental-stress` forward via - -$$ -\begin{aligned} -\bm F \diff\bm S \bm F^T &= \lambda (\bm C^{-1} \!:\! \diff\bm E) \bm F \bm C^{-1} \bm F^T - + 2 (\mu - \lambda \log J) \bm F \bm C^{-1} \diff\bm E \, \bm C^{-1} \bm F^T \\ - &= \lambda (\bm C^{-1} \!:\! \diff\bm E) \bm I_3 + 2 (\mu - \lambda \log J) \bm F^{-T} \diff\bm E \, \bm F^{-1} \\ - &= \lambda \operatorname{trace}(\nabla_x \diff\bm u) \bm I_3 + 2 (\mu - \lambda \log J) \diff\bm \epsilon -\end{aligned} -$$ - -where we have used - -$$ -\begin{aligned} -\bm C^{-1} \!:\! \diff\bm E &= \bm F^{-1} \bm F^{-T} \!:\! 
\bm F^T \diff\bm F \\ -&= \operatorname{trace}(\bm F^{-1} \bm F^{-T} \bm F^T \diff \bm F) \\ -&= \operatorname{trace}(\bm F^{-1} \diff\bm F) \\ -&= \operatorname{trace}(\diff \bm F \bm F^{-1}) \\ -&= \operatorname{trace}(\nabla_x \diff\bm u) -\end{aligned} -$$ - -and - -$$ -\begin{aligned} -\bm F^{-T} \diff\bm E \, \bm F^{-1} &= \frac 1 2 \bm F^{-T} (\bm F^T \diff\bm F + \diff\bm F^T \bm F) \bm F^{-1} \\ -&= \frac 1 2 (\diff \bm F \bm F^{-1} + \bm F^{-T} \diff\bm F^T) \\ -&= \frac 1 2 \Big(\nabla_x \diff\bm u + (\nabla_x\diff\bm u)^T \Big) \equiv \diff\bm\epsilon. -\end{aligned} -$$ -::: - -Collecting terms, the weak form of the Newton linearization for Neo-Hookean materials in the current configuration is - -$$ -\int_{\Omega_0} \nabla_x \bm v \!:\! \Big( (\nabla_x \diff\bm u) \bm\tau + \lambda \operatorname{trace}(\diff\bm\epsilon)\bm I_3 + 2(\mu - \lambda\log J)\diff \bm\epsilon \Big) dV = \text{rhs}, -$$ (jacobian-weak-form-current2) - -which equivalent to Algorithm 2 of {cite}`davydov2020matrix` and requires only derivatives with respect to the current configuration. Note that {eq}`cur_simp_Jac` and {eq}`jacobian-weak-form-current2` have recovered the same representation -using different algebraic manipulations. - -:::{tip} -We define a second order *Green-Euler* strain tensor (cf. Green-Lagrange strain {eq}`eq-green-lagrange-strain`) as - -$$ -\bm e = \frac 1 2 \Big(\bm{b} - \bm{I}_3 \Big) = \frac 1 2 \Big( \nabla_X \bm{u} + (\nabla_X \bm{u})^T + \nabla_X \bm{u} \, (\nabla_X \bm{u})^T \Big). -$$ (green-euler-strain) - -Then, the Kirchhoff stress tensor {eq}`tau-neo-hookean` can be written as - -$$ -\bm \tau = \lambda \log J \bm I_{3} + 2\mu \bm e, -$$ (tau-neo-hookean-stable) - -which is more numerically stable for small strain, and thus preferred for computation. Note that the $\log J$ is computed via `log1p` {eq}`log1p`, as we discussed in the previous tip. 
-::: - -### Jacobian representation - -We have implemented four storage variants for the Jacobian in our finite strain hyperelasticity. In each case, some variables are computed during residual evaluation and used during Jacobian application. - -:::{list-table} Four algorithms for Jacobian action in finite strain hyperelasticity problem -:header-rows: 1 -:widths: auto - -* - Option `-problem` - - Static storage - - Computed storage - - \# scalars - - Equations - - -* - `FSInitial-NH1` - - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$ - - $\nabla_X \bm u$ - - 19 - - {eq}`eq-diff-P` {eq}`eq-neo-hookean-incremental-stress` - -* - `FSInitial-NH2` - - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$ - - $\nabla_X \bm u, \bm C^{-1}, \lambda \log J$ - - 26 - - {eq}`eq-diff-P` {eq}`eq-neo-hookean-incremental-stress` - -* - `FSCurrent-NH1` - - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$ - - $\nabla_X \bm u$ - - 19 - - {eq}`jacobian-weak-form-current` {eq}`nabla_xdu` - -* - `FSCurrent-NH2` - - $\operatorname{det}\nabla_{\hat X} X$ - - $\nabla_x \hat X, \bm \tau, \lambda \log J$ - - 17 - - {eq}`jacobian-weak-form-current` {eq}`jacobian-weak-form-current2` -::: diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h index 249e1fd604..f596a01b60 100644 --- a/examples/solids/problems/cl-problems.h +++ b/examples/solids/problems/cl-problems.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,22 +7,7 @@ #pragma once // Problem options -typedef enum { - ELAS_LINEAR = 0, - ELAS_SS_NH = 1, - ELAS_FSInitial_NH1 = 2, - ELAS_FSInitial_NH2 = 3, - ELAS_FSCurrent_NH1 = 4, - ELAS_FSCurrent_NH2 = 5, - ELAS_FSInitial_MR1 = 6 -} problemType; -static const char *const problemTypes[] = {"Linear", "SS-NH", "FSInitial-NH1", "FSInitial-NH2", "FSCurrent-NH1", - "FSCurrent-NH2", "FSInitial-MR1", "problemType", "ELAS_", 0}; -static const char *const problemTypesForDisp[] = { - "Linear elasticity", - "Hyperelasticity small strain, Neo-Hookean", - "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage", - "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u), C_inv, constant storage", - "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage", - "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxcurr, tau, constant storage", - "Hyperelasticity finite strain Initial configuration Moony-Rivlin w/ dXref_dxinit, Grad(u) storage"}; +typedef enum { ELAS_LINEAR = 0, ELAS_FS_NH = 1, ELAS_FS_MR = 2 } problemType;  // values follow the order of problemTypes[] +static const char *const problemTypes[] = {"Linear", "FS-NH", "FS-MR", "problemType", "ELAS_", 0}; +static const char *const problemTypesForDisp[] = {"Linear elasticity", "Hyperelasticity finite strain Initial configuration Neo-Hookean", + "Hyperelasticity finite strain Initial configuration Moony-Rivlin"}; diff --git a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c b/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c deleted file mode 100644 index 57d37efd63..0000000000 --- a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../qfunctions/finite-strain-mooney-rivlin-initial-1.h" - -#include -#include - -#include "../include/setup-libceed.h" -#include "../include/structs.h" -#include "../problems/mooney-rivlin.h" -#include "../problems/problems.h" -#include "../qfunctions/common.h" - -static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; - -ProblemData finite_strain_Mooney_Rivlin_initial_1 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSInitialMR1F, - .residual_loc = ElasFSInitialMR1F_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSInitialMR1dF, - .jacobian_loc = ElasFSInitialMR1dF_loc, - .energy = ElasFSInitialMR1Energy, - .energy_loc = ElasFSInitialMR1Energy_loc, - .diagnostic = ElasFSInitialMR1Diagnostic, - .diagnostic_loc = ElasFSInitialMR1Diagnostic_loc, -}; - -PetscErrorCode SetupLibceedFineLevel_ElasFSInitialMR1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin_initial_1, fine_level, - num_comp_u, U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; - -PetscErrorCode SetupLibceedLevel_ElasFSInitialMR1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin_initial_1, level, num_comp_u, U_g_size, 
U_loc_size, fine_mult, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; diff --git a/examples/solids/problems/finite-strain-mooney-rivlin.c b/examples/solids/problems/finite-strain-mooney-rivlin.c new file mode 100644 index 0000000000..9798eeb26e --- /dev/null +++ b/examples/solids/problems/finite-strain-mooney-rivlin.c @@ -0,0 +1,58 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "../qfunctions/finite-strain-mooney-rivlin.h" + +#include +#include + +#include "../include/setup-libceed.h" +#include "../include/structs.h" +#include "../problems/mooney-rivlin.h" +#include "../problems/problems.h" +#include "../qfunctions/common.h" + +static const char *const field_names[] = {"gradu"}; +static CeedInt field_sizes[] = {9}; + +ProblemData finite_strain_Mooney_Rivlin = { + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasFSResidual_MR, + .residual_loc = ElasFSResidual_MR_loc, + .number_fields_stored = 1, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasFSJacobian_MR, + .jacobian_loc = ElasFSJacobian_MR_loc, + .energy = ElasFSEnergy_MR, + .energy_loc = ElasFSEnergy_MR_loc, + .diagnostic = ElasFSDiagnostic_MR, + .diagnostic_loc = ElasFSDiagnostic_MR_loc, +}; + +PetscErrorCode SetupLibceedFineLevel_ElasFSMR(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed, + CeedVector neumann_ceed, CeedData *data) { + PetscFunctionBegin; + + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin, fine_level, num_comp_u, + 
U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); + + PetscFunctionReturn(PETSC_SUCCESS); +}; + +PetscErrorCode SetupLibceedLevel_ElasFSMR(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { + PetscFunctionBegin; + + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); + + PetscFunctionReturn(PETSC_SUCCESS); +}; diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-1.c b/examples/solids/problems/finite-strain-neo-hookean-current-1.c deleted file mode 100644 index e6ad6a8a99..0000000000 --- a/examples/solids/problems/finite-strain-neo-hookean-current-1.c +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../qfunctions/finite-strain-neo-hookean-current-1.h" - -#include -#include - -#include "../include/setup-libceed.h" -#include "../include/structs.h" -#include "../problems/neo-hookean.h" -#include "../problems/problems.h" -#include "../qfunctions/common.h" - -static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; - -ProblemData finite_strain_neo_Hookean_current_1 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSCurrentNH1F, - .residual_loc = ElasFSCurrentNH1F_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSCurrentNH1dF, - .jacobian_loc = ElasFSCurrentNH1dF_loc, - .energy = ElasFSCurrentNH1Energy, - .energy_loc = ElasFSCurrentNH1Energy_loc, - .diagnostic = ElasFSCurrentNH1Diagnostic, - .diagnostic_loc = 
ElasFSCurrentNH1Diagnostic_loc, -}; - -PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_1, fine_level, num_comp_u, - U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; - -PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-2.c b/examples/solids/problems/finite-strain-neo-hookean-current-2.c deleted file mode 100644 index 78f34d3ee7..0000000000 --- a/examples/solids/problems/finite-strain-neo-hookean-current-2.c +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../qfunctions/finite-strain-neo-hookean-current-2.h" - -#include -#include - -#include "../include/setup-libceed.h" -#include "../include/structs.h" -#include "../problems/neo-hookean.h" -#include "../problems/problems.h" -#include "../qfunctions/common.h" - -static const char *const field_names[] = {"dXdx", "tau", "lambda_log_J"}; -static CeedInt field_sizes[] = {9, 6, 1}; - -ProblemData finite_strain_neo_Hookean_current_2 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSCurrentNH2F, - .residual_loc = ElasFSCurrentNH2F_loc, - .number_fields_stored = 3, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSCurrentNH2dF, - .jacobian_loc = ElasFSCurrentNH2dF_loc, - .energy = ElasFSCurrentNH2Energy, - .energy_loc = ElasFSCurrentNH2Energy_loc, - .diagnostic = ElasFSCurrentNH2Diagnostic, - .diagnostic_loc = ElasFSCurrentNH2Diagnostic_loc, -}; - -PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_2, fine_level, num_comp_u, - U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; - -PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_2, level, num_comp_u, 
U_g_size, U_loc_size, fine_mult, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c b/examples/solids/problems/finite-strain-neo-hookean-initial-1.c deleted file mode 100644 index cb45b602ad..0000000000 --- a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../qfunctions/finite-strain-neo-hookean-initial-1.h" - -#include -#include - -#include "../include/setup-libceed.h" -#include "../include/structs.h" -#include "../problems/neo-hookean.h" -#include "../problems/problems.h" -#include "../qfunctions/common.h" - -static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; - -ProblemData finite_strain_neo_Hookean_initial_1 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSInitialNH1F, - .residual_loc = ElasFSInitialNH1F_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSInitialNH1dF, - .jacobian_loc = ElasFSInitialNH1dF_loc, - .energy = ElasFSInitialNH1Energy, - .energy_loc = ElasFSInitialNH1Energy_loc, - .diagnostic = ElasFSInitialNH1Diagnostic, - .diagnostic_loc = ElasFSInitialNH1Diagnostic_loc, -}; - -PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedFineLevel(dm, dm_energy, 
dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_1, fine_level, num_comp_u, - U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; - -PetscErrorCode SetupLibceedLevel_ElasFSInitialNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c b/examples/solids/problems/finite-strain-neo-hookean-initial-2.c deleted file mode 100644 index 9d52b35aec..0000000000 --- a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../qfunctions/finite-strain-neo-hookean-initial-2.h" - -#include -#include - -#include "../include/setup-libceed.h" -#include "../include/structs.h" -#include "../problems/neo-hookean.h" -#include "../problems/problems.h" -#include "../qfunctions/common.h" - -static const char *const field_names[] = {"gradu", "C_inv", "lambda_log_J"}; -static CeedInt field_sizes[] = {9, 6, 1}; - -ProblemData finite_strain_neo_Hookean_initial_2 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSInitialNH2F, - .residual_loc = ElasFSInitialNH2F_loc, - .number_fields_stored = 3, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSInitialNH2dF, - .jacobian_loc = ElasFSInitialNH2dF_loc, - .energy = ElasFSInitialNH2Energy, - .energy_loc = ElasFSInitialNH2Energy_loc, - .diagnostic = ElasFSInitialNH2Diagnostic, - .diagnostic_loc = ElasFSInitialNH2Diagnostic_loc, -}; - -PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_2, fine_level, num_comp_u, - U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; - -PetscErrorCode SetupLibceedLevel_ElasFSInitialNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscFunctionBegin; - - PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_2, level, num_comp_u, 
U_g_size, U_loc_size, fine_mult, data)); - - PetscFunctionReturn(PETSC_SUCCESS); -}; diff --git a/examples/solids/problems/small-strain-neo-hookean.c b/examples/solids/problems/finite-strain-neo-hookean.c similarity index 63% rename from examples/solids/problems/small-strain-neo-hookean.c rename to examples/solids/problems/finite-strain-neo-hookean.c index be2fb27c43..3948d257e3 100644 --- a/examples/solids/problems/small-strain-neo-hookean.c +++ b/examples/solids/problems/finite-strain-neo-hookean.c @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include "../qfunctions/small-strain-neo-hookean.h" +#include "../qfunctions/finite-strain-neo-hookean.h" #include #include @@ -19,40 +19,40 @@ static const char *const field_names[] = {"gradu"}; static CeedInt field_sizes[] = {9}; -ProblemData small_strain_neo_Hookean = { +ProblemData finite_strain_neo_Hookean = { .setup_geo = SetupGeo, .setup_geo_loc = SetupGeo_loc, .q_data_size = 10, .quadrature_mode = CEED_GAUSS, - .residual = ElasSSNHF, - .residual_loc = ElasSSNHF_loc, + .residual = ElasFSResidual_NH, + .residual_loc = ElasFSResidual_NH_loc, .number_fields_stored = 1, .field_names = field_names, .field_sizes = field_sizes, - .jacobian = ElasSSNHdF, - .jacobian_loc = ElasSSNHdF_loc, - .energy = ElasSSNHEnergy, - .energy_loc = ElasSSNHEnergy_loc, - .diagnostic = ElasSSNHDiagnostic, - .diagnostic_loc = ElasSSNHDiagnostic_loc, + .jacobian = ElasFSJacobian_NH, + .jacobian_loc = ElasFSJacobian_NH_loc, + .energy = ElasFSEnergy_NH, + .energy_loc = ElasFSEnergy_NH_loc, + .diagnostic = ElasFSDiagnostic_NH, + .diagnostic_loc = ElasFSDiagnostic_NH_loc, }; -PetscErrorCode 
SetupLibceedFineLevel_ElasSSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, +PetscErrorCode SetupLibceedFineLevel_ElasFSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, small_strain_neo_Hookean, fine_level, num_comp_u, U_g_size, + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean, fine_level, num_comp_u, U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(PETSC_SUCCESS); }; -PetscErrorCode SetupLibceedLevel_ElasSSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, +PetscErrorCode SetupLibceedLevel_ElasFSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, small_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(PETSC_SUCCESS); }; diff --git a/examples/solids/problems/linear.c b/examples/solids/problems/linear.c index c013ee716a..82cb9635d1 100644 --- a/examples/solids/problems/linear.c +++ b/examples/solids/problems/linear.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -22,15 +22,15 @@ ProblemData linear_elasticity = { .setup_geo_loc = SetupGeo_loc, .q_data_size = 10, .quadrature_mode = CEED_GAUSS, - .residual = ElasLinearF, - .residual_loc = ElasLinearF_loc, + .residual = ElasResidual_Linear, + .residual_loc = ElasResidual_Linear_loc, .number_fields_stored = 0, - .jacobian = ElasLineardF, - .jacobian_loc = ElasLineardF_loc, - .energy = ElasLinearEnergy, - .energy_loc = ElasLinearEnergy_loc, - .diagnostic = ElasLinearDiagnostic, - .diagnostic_loc = ElasLinearDiagnostic_loc, + .jacobian = ElasJacobian_Linear, + .jacobian_loc = ElasJacobian_Linear_loc, + .energy = ElasEnergy_Linear, + .energy_loc = ElasEnergy_Linear_loc, + .diagnostic = ElasDiagnostic_Linear, + .diagnostic_loc = ElasDiagnostic_Linear_loc, .true_soln = MMSTrueSoln, .true_soln_loc = MMSTrueSoln_loc, }; diff --git a/examples/solids/problems/mooney-rivlin.c b/examples/solids/problems/mooney-rivlin.c index 2449e98742..4444250187 100644 --- a/examples/solids/problems/mooney-rivlin.c +++ b/examples/solids/problems/mooney-rivlin.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/problems/mooney-rivlin.h b/examples/solids/problems/mooney-rivlin.h index 2063e06e19..0903df99d6 100644 --- a/examples/solids/problems/mooney-rivlin.h +++ b/examples/solids/problems/mooney-rivlin.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/problems/neo-hookean.c b/examples/solids/problems/neo-hookean.c index 560717673e..dfd2d68005 100644 --- a/examples/solids/problems/neo-hookean.c +++ b/examples/solids/problems/neo-hookean.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/problems/neo-hookean.h b/examples/solids/problems/neo-hookean.h index 72f6a2ed65..a80e508001 100644 --- a/examples/solids/problems/neo-hookean.h +++ b/examples/solids/problems/neo-hookean.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c index e125997093..5a819aecf1 100644 --- a/examples/solids/problems/problems.c +++ b/examples/solids/problems/problems.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -14,12 +14,8 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions) { PetscFunctionBegin; SOLIDS_PROBLEM_REGISTER(problem_functions, "Linear", ElasLinear, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "SS-NH", ElasSSNH, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH1", ElasFSCurrentNH1, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH2", ElasFSCurrentNH2, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH1", ElasFSInitialNH1, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH2", ElasFSInitialNH2, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-MR1", ElasFSInitialMR1, MR); + SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-NH", ElasFSNH, NH); + SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-MR", ElasFSMR, MR); PetscFunctionReturn(PETSC_SUCCESS); }; diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h index 17503fda72..e71ab1719b 100644 --- a/examples/solids/problems/problems.h +++ b/examples/solids/problems/problems.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -35,9 +35,5 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions); PetscInt u_loc_size, CeedVector fine_mult, CeedData *data); SOLIDS_PROBLEM(ElasLinear); -SOLIDS_PROBLEM(ElasSSNH); -SOLIDS_PROBLEM(ElasFSCurrentNH1); -SOLIDS_PROBLEM(ElasFSCurrentNH2); -SOLIDS_PROBLEM(ElasFSInitialNH1); -SOLIDS_PROBLEM(ElasFSInitialNH2); -SOLIDS_PROBLEM(ElasFSInitialMR1); +SOLIDS_PROBLEM(ElasFSNH); +SOLIDS_PROBLEM(ElasFSMR); diff --git a/examples/solids/qfunctions/common.h b/examples/solids/qfunctions/common.h index bfdb92522f..cf63c02a93 100644 --- a/examples/solids/qfunctions/common.h +++ b/examples/solids/qfunctions/common.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,7 @@ /// @file /// Geometric factors for solid mechanics example using PETSc -#include +#include // ----------------------------------------------------------------------------- // This QFunction sets up the geometric factors required for integration and coordinate transformations diff --git a/examples/solids/qfunctions/constant-force.h b/examples/solids/qfunctions/constant-force.h index a94dc4f3bf..e37505c7e4 100644 --- a/examples/solids/qfunctions/constant-force.h +++ b/examples/solids/qfunctions/constant-force.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// Constant forcing term for solid mechanics example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #ifndef PHYSICS_STRUCT #define PHYSICS_STRUCT diff --git a/examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h similarity index 95% rename from examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h rename to examples/solids/qfunctions/finite-strain-mooney-rivlin.h index 444b71d27f..9fc34b9ff5 100644 --- a/examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h +++ b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// Hyperelasticity, finite strain for solid mechanics example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // Mooney-Rivlin context @@ -91,14 +93,14 @@ CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedS }; for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA); - return 0; -}; + return CEED_ERROR_SUCCESS; +} #endif // ----------------------------------------------------------------------------- // Common computations between FS and dFS // ----------------------------------------------------------------------------- -CEED_QFUNCTION_HELPER int commonFSMR1(const CeedScalar mu_1, const CeedScalar mu_2, const CeedScalar lambda, const CeedScalar grad_u[3][3], - CeedScalar Swork[6], CeedScalar Cwork[6], CeedScalar Cinvwork[6], CeedScalar *logJ) { +CEED_QFUNCTION_HELPER int commonFSMR(const CeedScalar mu_1, const CeedScalar mu_2, const CeedScalar lambda, const CeedScalar grad_u[3][3], + CeedScalar Swork[6], CeedScalar Cwork[6], CeedScalar Cinvwork[6], CeedScalar *logJ) { // E - Green-Lagrange strain tensor // E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u) const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; @@ -147,13 +149,13 @@ CEED_QFUNCTION_HELPER int commonFSMR1(const CeedScalar mu_1, const CeedScalar mu - mu_2 * Cwork[i]; } - return 0; -}; + return CEED_ERROR_SUCCESS; +} // ----------------------------------------------------------------------------- // Residual evaluation for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSResidual_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const 
CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -218,7 +220,7 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const // Common components of finite strain calculations CeedScalar Swork[6], Cwork[6], Cinvwork[6], logJ; - commonFSMR1(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ); + commonFSMR(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ); // Second Piola-Kirchhoff (S) const CeedScalar S[3][3] = { @@ -245,13 +247,13 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const } } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Jacobian evaluation for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSJacobian_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -311,7 +313,7 @@ CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const // Common components of finite strain calculations CeedScalar Swork[6], Cwork[6], Cinvwork[6], logJ; - commonFSMR1(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ); + commonFSMR(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ); // dE - Green-Lagrange strain tensor const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; @@ -408,12 +410,13 @@ CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const } } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } + // 
----------------------------------------------------------------------------- // Strain energy computation for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialMR1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSEnergy_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -498,13 +501,13 @@ CEED_QFUNCTION(ElasFSInitialMR1Energy)(void *ctx, CeedInt Q, const CeedScalar *c } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Nodal diagnostic quantities for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialMR1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSDiagnostic_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; @@ -605,6 +608,6 @@ CEED_QFUNCTION(ElasFSInitialMR1Diagnostic)(void *ctx, CeedInt Q, const CeedScala diagnostic[7][i] = (0.5 * lambda * logJ * logJ - (mu_1 + 2 * mu_2) * logJ + (mu_1 / 2.) * (I_1 - 3) + (mu_2 / 2.) 
* (I_2 - 3)); } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h b/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h deleted file mode 100644 index 129a0af3b4..0000000000 --- a/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Hyperelasticity, finite strain for solid mechanics example using PETSc - -#include -#include - -#ifndef PHYSICS_STRUCT -#define PHYSICS_STRUCT -typedef struct Physics_private *Physics; -struct Physics_private { - CeedScalar nu; // Poisson's ratio - CeedScalar E; // Young's Modulus -}; -#endif - -// ----------------------------------------------------------------------------- -// Series approximation of log1p() -// log1p() is not vectorized in libc -// -// The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1. -// The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean -// model. -// ----------------------------------------------------------------------------- -#ifndef LOG1P_SERIES_SHIFTED -#define LOG1P_SERIES_SHIFTED -CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) { - const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) 
- 1; - CeedScalar sum = 0; - if (1) { // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient - if (x < left) { // Replace if with while for arbitrary range (may hurt vectorization) - sum -= log(2.) / 2; - x = 1 + 2 * x; - } else if (right < x) { - sum += log(2.) / 2; - x = (x - 1) / 2; - } - } - CeedScalar y = x / (2. + x); - const CeedScalar y2 = y * y; - sum += y; - y *= y2; - sum += y / 3; - y *= y2; - sum += y / 5; - y *= y2; - sum += y / 7; - return 2 * sum; -}; -#endif - -// ----------------------------------------------------------------------------- -// Compute det F - 1 -// ----------------------------------------------------------------------------- -#ifndef DETJM1 -#define DETJM1 -CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) { - return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) + - grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) + - grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] + - grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] - - grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1]; -}; -#endif - -// ----------------------------------------------------------------------------- -// Compute matrix^(-1), where matrix is nonsymetric, returns array of 9 -// ----------------------------------------------------------------------------- -#ifndef MatinvNonSym -#define MatinvNonSym -CEED_QFUNCTION_HELPER int computeMatinvNonSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[9]) { - // Compute A^(-1) : A-Inverse - CeedScalar B[9] = { - A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */ - A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */ - A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */ - A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */ - A[0][1] * A[1][2] - A[0][2] * A[1][1], /* 
*NOPAD* */ - A[0][2] * A[2][1] - A[0][1] * A[2][2], /* *NOPAD* */ - A[0][1] * A[2][0] - A[0][0] * A[2][1], /* *NOPAD* */ - A[1][0] * A[2][1] - A[1][1] * A[2][0], /* *NOPAD* */ - A[1][2] * A[2][0] - A[1][0] * A[2][2] /* *NOPAD* */ - }; - for (CeedInt m = 0; m < 9; m++) Ainv[m] = B[m] / (detA); - - return 0; -}; -#endif - -// ----------------------------------------------------------------------------- -// Common computations between Ftau and dFtau -// ----------------------------------------------------------------------------- -CEED_QFUNCTION_HELPER int commonFtau(const CeedScalar lambda, const CeedScalar mu, const CeedScalar Grad_u[3][3], CeedScalar Finv[3][3], - CeedScalar tau_work[6], CeedScalar *llnj) { - // Compute The Deformation Gradient : F = I3 + Grad_u - const CeedScalar F[3][3] = { - {Grad_u[0][0] + 1, Grad_u[0][1], Grad_u[0][2] }, - {Grad_u[1][0], Grad_u[1][1] + 1, Grad_u[1][2] }, - {Grad_u[2][0], Grad_u[2][1], Grad_u[2][2] + 1} - }; - - // b - I3 = (Grad_u + Grad_u^T + Grad_u*Grad_u^T) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar bMI3[6]; - for (CeedInt m = 0; m < 6; m++) { - bMI3[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) bMI3[m] += Grad_u[indj[m]][n] * Grad_u[indk[m]][n]; - } - const CeedScalar Jm1 = computeJM1(Grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - - // Computer F^(-1) - const CeedScalar detF = Jm1 + 1.; - CeedScalar Finvwork[9]; - computeMatinvNonSym(F, detF, Finvwork); - - Finv[0][0] = Finvwork[0]; - Finv[0][1] = Finvwork[5]; - Finv[0][2] = Finvwork[4]; - Finv[1][0] = Finvwork[8]; - Finv[1][1] = Finvwork[1]; - Finv[1][2] = Finvwork[3]; - Finv[2][0] = Finvwork[7]; - Finv[2][1] = Finvwork[6]; - Finv[2][2] = Finvwork[2]; - - // Compute the Kirchhoff stress (tau) tau = mu*(b - I3) + lambda*log(J)*I3 - *llnj = lambda * logJ; - - tau_work[0] = mu * bMI3[0] + *llnj; - tau_work[1] = mu * bMI3[1] + *llnj; - tau_work[2] = mu * bMI3[2] + 
*llnj; - tau_work[3] = mu * bMI3[3]; - tau_work[4] = mu * bMI3[4]; - tau_work[5] = mu * bMI3[5]; - - return 0; -}; - -// ----------------------------------------------------------------------------- -// Residual evaluation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - // Store grad_u for HyperFSdF (Jacobian of HyperFSF) - CeedScalar(*Grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Formulation Terminology: - // I3 : 3x3 Identity matrix - // b : left Cauchy-Green tensor - // binv : inverse of b - // F : deformation gradient - // tau : Kirchhoff stress (in current config) - // Formulation: - // F = I3 + Grad_u - // J = det(F) - // b = F*F(^T) - // tau = mu*(b - I3) + lambda*log(J)*I3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - // dXdx_initial = dX/dx_initial - // X is natural coordinate sys OR Reference [-1,1]^dim - // x_initial is initial config coordinate system - const CeedScalar dXdx_initial[3][3] = { - 
{q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute Grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - Grad_u[j][k][i] = 0; - for (CeedInt m = 0; m < 3; m++) Grad_u[j][k][i] += du[j][m] * dXdx_initial[m][k]; - } - } - - const CeedScalar tempGradu[3][3] = { - {Grad_u[0][0][i], Grad_u[0][1][i], Grad_u[0][2][i]}, - {Grad_u[1][0][i], Grad_u[1][1][i], Grad_u[1][2][i]}, - {Grad_u[2][0][i], Grad_u[2][1][i], Grad_u[2][2][i]} - }; - - // Common components of finite strain calculations - CeedScalar Finv[3][3], tau_work[6], llnj; - - commonFtau(lambda, mu, tempGradu, Finv, tau_work, &llnj); - const CeedScalar tau[3][3] = { - {tau_work[0], tau_work[5], tau_work[4]}, - {tau_work[5], tau_work[1], tau_work[3]}, - {tau_work[4], tau_work[3], tau_work[2]} - }; - // x is current config coordinate system - // dXdx = dX/dx = dX/dx_initial * F^{-1} - // Note that F^{-1} = dx_initial/dx - CeedScalar dXdx[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - dXdx[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) dXdx[j][k] += dXdx_initial[j][m] * Finv[m][k]; - } - } - - // Apply dXdx^T and weight to intermediate stress - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - dvdX[k][j][i] = 0; - for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * tau[j][m] * wdetJ; - } - } - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Jacobian evaluation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - 
const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - // F is used for hyperelasticity (non-linear) - const CeedScalar(*Grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2]; - - // Outputs - CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of delta_u - const CeedScalar deltadu[3][3] = { - {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]}, - {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]}, - {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - // dXdx_initial = dX/dx_initial - // X is natural coordinate sys OR Reference [-1,1]^dim - // x_initial is initial config coordinate system - const CeedScalar dXdx_initial[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute Grad_du - // dXdx = (dx/dX)^(-1) - // Apply dXdx to deltadu = graddelta - // This is dF = Grad_du - CeedScalar Grad_du[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - Grad_du[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) Grad_du[j][k] += dXdx_initial[m][k] * deltadu[j][m]; - } - } - - const CeedScalar tempGradu[3][3] = { - {Grad_u[0][0][i], Grad_u[0][1][i], Grad_u[0][2][i]}, - {Grad_u[1][0][i], Grad_u[1][1][i], Grad_u[1][2][i]}, - {Grad_u[2][0][i], Grad_u[2][1][i], 
Grad_u[2][2][i]} - }; - - // Common components of finite strain calculations - CeedScalar F_inv[3][3], tau_work[6], llnj; - - // Common components of finite strain calculations (cur. config.) - commonFtau(lambda, mu, tempGradu, F_inv, tau_work, &llnj); - const CeedScalar tau[3][3] = { - {tau_work[0], tau_work[5], tau_work[4]}, - {tau_work[5], tau_work[1], tau_work[3]}, - {tau_work[4], tau_work[3], tau_work[2]} - }; - - // Compute grad_du = \nabla_x (deltau) = deltau * dX/dx - CeedScalar grad_du[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - grad_du[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) grad_du[j][k] += deltadu[j][m] * F_inv[m][k]; - } - } - - // Compute grad_du_tau = grad_du*tau - CeedScalar grad_du_tau[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - grad_du_tau[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) grad_du_tau[j][k] += grad_du[j][m] * tau[m][k]; - } - } - - // Compute depsilon = (grad_du + grad_du^T)/2 - const CeedScalar depsilon[3][3] = { - {(grad_du[0][0] + grad_du[0][0]) / 2., (grad_du[0][1] + grad_du[1][0]) / 2., (grad_du[0][2] + grad_du[2][0]) / 2.}, - {(grad_du[1][0] + grad_du[0][1]) / 2., (grad_du[1][1] + grad_du[1][1]) / 2., (grad_du[1][2] + grad_du[2][1]) / 2.}, - {(grad_du[2][0] + grad_du[0][2]) / 2., (grad_du[2][1] + grad_du[1][2]) / 2., (grad_du[2][2] + grad_du[2][2]) / 2.} - }; - // Compute trace(depsilon) - CeedScalar tr_deps = depsilon[0][0] + depsilon[1][1] + depsilon[2][2]; - // Compute grad_du*tau + trace(depsilon)I3 - grad_du_tau[0][0] += lambda * tr_deps; - grad_du_tau[1][1] += lambda * tr_deps; - grad_du_tau[2][2] += lambda * tr_deps; - // Compute dp = grad_du*tau + trace(depsilon)I3 +2(mu-lambda*logJ)depsilon - CeedScalar dp[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - dp[j][k] = grad_du_tau[j][k] + 2 * (mu - llnj) * depsilon[j][k]; - } - } - - // x is current config coordinate system - // dXdx = dX/dx = dX/dx_initial * F^{-1} 
- // Note that F^{-1} = dx_initial/dx - CeedScalar dXdx[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - dXdx[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) dXdx[j][k] += dXdx_initial[j][m] * F_inv[m][k]; - } - } - - // Apply dXdx^T and weight - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - deltadvdX[k][j][i] = 0; - for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * dp[j][m] * wdetJ; - } - } - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Strain energy computation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*energy) = (CeedScalar(*))out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute 
Grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar Grad_u[3][3]; - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - Grad_u[j][k] = 0; - for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // E - Green-Lagrange strain tensor - // E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]]; - } - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - - // Strain energy Phi(E) for compressible Neo-Hookean - const CeedScalar Jm1 = computeJM1(Grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) 
* wdetJ; - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Nodal diagnostic quantities for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; - - // Outputs - CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute Grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = Grad_u - CeedScalar Grad_u[3][3]; - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - Grad_u[j][k] = 0; - for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // E - Green-Lagrange strain tensor - // E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar 
E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]]; - } - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - - // Displacement - diagnostic[0][i] = u[0][i]; - diagnostic[1][i] = u[1][i]; - diagnostic[2][i] = u[2][i]; - - // Pressure - const CeedScalar Jm1 = computeJM1(Grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - diagnostic[3][i] = -lambda * logJ; - - // Stress tensor invariants - diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.; - diagnostic[5][i] = 0.; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.; - } - diagnostic[6][i] = Jm1 + 1.; - - // Strain energy - diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.); - } // End of Quadrature Point Loop - - return 0; -} -// ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h b/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h deleted file mode 100644 index b03334f999..0000000000 --- a/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h +++ /dev/null @@ -1,482 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Hyperelasticity, finite strain for solid mechanics example using PETSc - -#include -#include - -#ifndef PHYSICS_STRUCT -#define PHYSICS_STRUCT -typedef struct Physics_private *Physics; -struct Physics_private { - CeedScalar nu; // Poisson's ratio - CeedScalar E; // Young's Modulus -}; -#endif - -// ----------------------------------------------------------------------------- -// Series approximation of log1p() -// log1p() is not vectorized in libc -// -// The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1. -// The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean -// model. -// ----------------------------------------------------------------------------- -#ifndef LOG1P_SERIES_SHIFTED -#define LOG1P_SERIES_SHIFTED -CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) { - const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1; - CeedScalar sum = 0; - if (1) { // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient - if (x < left) { // Replace if with while for arbitrary range (may hurt vectorization) - sum -= log(2.) / 2; - x = 1 + 2 * x; - } else if (right < x) { - sum += log(2.) / 2; - x = (x - 1) / 2; - } - } - CeedScalar y = x / (2. 
+ x); - const CeedScalar y2 = y * y; - sum += y; - y *= y2; - sum += y / 3; - y *= y2; - sum += y / 5; - y *= y2; - sum += y / 7; - return 2 * sum; -}; -#endif - -// ----------------------------------------------------------------------------- -// Compute det F - 1 -// ----------------------------------------------------------------------------- -#ifndef DETJM1 -#define DETJM1 -CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) { - return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) + - grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) + - grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] + - grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] - - grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1]; -}; -#endif - -// ----------------------------------------------------------------------------- -// Compute matrix^(-1), where matrix is nonsymetric, returns array of 9 -// ----------------------------------------------------------------------------- -#ifndef MatinvNonSym -#define MatinvNonSym -CEED_QFUNCTION_HELPER int computeMatinvNonSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[9]) { - // Compute A^(-1) : A-Inverse - CeedScalar B[9] = { - A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */ - A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */ - A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */ - A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */ - A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */ - A[0][2] * A[2][1] - A[0][1] * A[2][2], /* *NOPAD* */ - A[0][1] * A[2][0] - A[0][0] * A[2][1], /* *NOPAD* */ - A[1][0] * A[2][1] - A[1][1] * A[2][0], /* *NOPAD* */ - A[1][2] * A[2][0] - A[1][0] * A[2][2] /* *NOPAD* */ - }; - for (CeedInt m = 0; m < 9; m++) Ainv[m] = B[m] / (detA); - - return 0; -}; -#endif - -// 
----------------------------------------------------------------------------- -// Residual evaluation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH2F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - // Store dXdx - CeedScalar(*dXdx)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1]; - // Store tau - CeedScalar(*tau)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[2]; - // Store constant lam_log_J = lambda*log(J) - CeedScalar(*lam_log_J)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[3]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Formulation Terminology: - // I3 : 3x3 Identity matrix - // b : left Cauchy-Green tensor - // F : deformation gradient - // tau : Kirchhoff stress (in current config) - // Formulation: - // F = I3 + Grad_ue - // J = det(F) - // b = F*F^{T} - // tau = mu*b - (mu - lambda*log(J))*I3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - // dXdx_initial = dX/dx_initial - // X is natural coordinate sys OR Reference [-1,1]^dim - // x_initial is initial config coordinate system - const CeedScalar 
dXdx_initial[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // X is natural coordinate sys OR Reference system - // x_initial is initial config coordinate system - // Grad_u =du/dx_initial= du/dX * dX/dx_initial - CeedScalar Grad_u[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - Grad_u[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) Grad_u[j][k] += du[j][m] * dXdx_initial[m][k]; - } - } - - // Compute The Deformation Gradient : F = I3 + Gradu - const CeedScalar F[3][3] = { - {Grad_u[0][0] + 1, Grad_u[0][1], Grad_u[0][2] }, - {Grad_u[1][0], Grad_u[1][1] + 1, Grad_u[1][2] }, - {Grad_u[2][0], Grad_u[2][1], Grad_u[2][2] + 1} - }; - - // b - I3 = (Grad_u + Grad_u^T + Grad_u*Grad_u^T) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar bMI3[6]; - for (CeedInt m = 0; m < 6; m++) { - bMI3[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) bMI3[m] += Grad_u[indj[m]][n] * Grad_u[indk[m]][n]; - } - - const CeedScalar Jm1 = computeJM1(Grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - - // store lam_log_J = lambda*log(J) - lam_log_J[0][i] = lambda * logJ; - - // tau = mu*b - Cc1*I3; - tau[0][i] = mu * bMI3[0] + lam_log_J[0][i]; - tau[1][i] = mu * bMI3[1] + lam_log_J[0][i]; - tau[2][i] = mu * bMI3[2] + lam_log_J[0][i]; - tau[3][i] = mu * bMI3[3]; - tau[4][i] = mu * bMI3[4]; - tau[5][i] = mu * bMI3[5]; - - // Computer F^{-1} - const CeedScalar detF = Jm1 + 1.; - CeedScalar Finvwork[9]; - computeMatinvNonSym(F, detF, Finvwork); - CeedScalar Finv[3][3]; - Finv[0][0] = Finvwork[0]; - Finv[0][1] = Finvwork[5]; - Finv[0][2] = Finvwork[4]; - Finv[1][0] = Finvwork[8]; - Finv[1][1] = Finvwork[1]; - Finv[1][2] = Finvwork[3]; - Finv[2][0] = Finvwork[7]; - Finv[2][1] = Finvwork[6]; - Finv[2][2] = Finvwork[2]; - - // x is current config 
coordinate system - // dXdx = dX/dx = dX/dx_initial * F^{-1} - // Note that F^{-1} = dx_initial/dx - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - dXdx[j][k][i] = 0; - for (CeedInt m = 0; m < 3; m++) dXdx[j][k][i] += dXdx_initial[j][m] * Finv[m][k]; - } - } - - const CeedScalar temptau[3][3] = { - {tau[0][i], tau[5][i], tau[4][i]}, - {tau[5][i], tau[1][i], tau[3][i]}, - {tau[4][i], tau[3][i], tau[2][i]} - }; - // Apply dXdx^T and weight to intermediate stress - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - dvdX[k][j][i] = 0; - for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m][i] * temptau[j][m] * wdetJ; - } - } - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Jacobian evaluation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH2dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - // dXdx computed in residual - const CeedScalar(*dXdx)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2]; - // tau computed in residual - const CeedScalar(*tau)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - // lam_log_J = lambda*log(J) computed in residual - const CeedScalar(*lam_log_J)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4]; - // Outputs - CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E 
/ (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of delta_u - const CeedScalar deltadu[3][3] = { - {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]}, - {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]}, - {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - - // Compute grad_du = \nabla_x (deltau) = deltau * dX/dx - CeedScalar grad_du[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - grad_du[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) grad_du[j][k] += deltadu[j][m] * dXdx[m][k][i]; - } - } - - const CeedScalar temptau[3][3] = { - {tau[0][i], tau[5][i], tau[4][i]}, - {tau[5][i], tau[1][i], tau[3][i]}, - {tau[4][i], tau[3][i], tau[2][i]} - }; - - // Compute grad_du_tau = grad_du*tau - CeedScalar grad_du_tau[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - grad_du_tau[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) grad_du_tau[j][k] += grad_du[j][m] * temptau[m][k]; - } - } - - // Compute depsilon = (grad_du + grad_du^T)/2 - const CeedScalar depsilon[3][3] = { - {(grad_du[0][0] + grad_du[0][0]) / 2., (grad_du[0][1] + grad_du[1][0]) / 2., (grad_du[0][2] + grad_du[2][0]) / 2.}, - {(grad_du[1][0] + grad_du[0][1]) / 2., (grad_du[1][1] + grad_du[1][1]) / 2., (grad_du[1][2] + grad_du[2][1]) / 2.}, - {(grad_du[2][0] + grad_du[0][2]) / 2., (grad_du[2][1] + grad_du[1][2]) / 2., (grad_du[2][2] + grad_du[2][2]) / 2.} - }; - // Compute trace(depsilon) - CeedScalar tr_deps = depsilon[0][0] + depsilon[1][1] + depsilon[2][2]; - // Compute grad_du*tau + trace(depsilon)I3 - grad_du_tau[0][0] += lambda * tr_deps; - grad_du_tau[1][1] += lambda * tr_deps; - grad_du_tau[2][2] += lambda * tr_deps; - // Compute dp = grad_du*tau + trace(depsilon)I3 +2(mu-lambda*logJ)depsilon - 
CeedScalar dp[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - dp[j][k] = grad_du_tau[j][k] + 2 * (mu - lam_log_J[0][i]) * depsilon[j][k]; - } - } - - // Apply dXdx^T and weight - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - deltadvdX[k][j][i] = 0; - for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m][i] * dp[j][m] * wdetJ; - } - } - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Strain energy computation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH2Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*energy) = (CeedScalar(*))out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute Grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar Grad_u[3][3]; - 
for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - Grad_u[j][k] = 0; - for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // E - Green-Lagrange strain tensor - // E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]]; - } - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - const CeedScalar Jm1 = computeJM1(Grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - - // Strain energy Phi(E) for compressible Neo-Hookean - energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ; - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Nodal diagnostic quantities for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSCurrentNH2Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; - - // Outputs - CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * 
Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute Grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar Grad_u[3][3]; - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - Grad_u[j][k] = 0; - for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // E - Green-Lagrange strain tensor - // E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]]; - } - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - - // Displacement - diagnostic[0][i] = u[0][i]; - diagnostic[1][i] = u[1][i]; - diagnostic[2][i] = u[2][i]; - - // Pressure - const CeedScalar Jm1 = computeJM1(Grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - diagnostic[3][i] = -lambda * logJ; - - // Stress tensor invariants - diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.; - diagnostic[5][i] = 0.; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.; - } - diagnostic[6][i] = Jm1 + 1.; - - // Strain energy - diagnostic[7][i] = (lambda * logJ * logJ / 2. 
- mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.); - } // End of Quadrature Point Loop - - return 0; -} -// ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h b/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h deleted file mode 100644 index 09c4bb99ce..0000000000 --- a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h +++ /dev/null @@ -1,559 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Hyperelasticity, finite strain for solid mechanics example using PETSc - -#include -#include - -#ifndef PHYSICS_STRUCT -#define PHYSICS_STRUCT -typedef struct Physics_private *Physics; -struct Physics_private { - CeedScalar nu; // Poisson's ratio - CeedScalar E; // Young's Modulus -}; -#endif - -// ----------------------------------------------------------------------------- -// Series approximation of log1p() -// log1p() is not vectorized in libc -// -// The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1. -// The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean -// model. -// ----------------------------------------------------------------------------- -#ifndef LOG1P_SERIES_SHIFTED -#define LOG1P_SERIES_SHIFTED -CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) { - const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) 
- 1; - CeedScalar sum = 0; - if (1) { // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient - if (x < left) { // Replace if with while for arbitrary range (may hurt vectorization) - sum -= log(2.) / 2; - x = 1 + 2 * x; - } else if (right < x) { - sum += log(2.) / 2; - x = (x - 1) / 2; - } - } - CeedScalar y = x / (2. + x); - const CeedScalar y2 = y * y; - sum += y; - y *= y2; - sum += y / 3; - y *= y2; - sum += y / 5; - y *= y2; - sum += y / 7; - return 2 * sum; -}; -#endif - -// ----------------------------------------------------------------------------- -// Compute det F - 1 -// ----------------------------------------------------------------------------- -#ifndef DETJM1 -#define DETJM1 -CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) { - return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) + - grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) + - grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] + - grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] - - grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1]; -}; -#endif - -// ----------------------------------------------------------------------------- -// Compute matrix^(-1), where matrix is symetric, returns array of 6 -// ----------------------------------------------------------------------------- -#ifndef MatinvSym -#define MatinvSym -CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[6]) { - // Compute A^(-1) : A-Inverse - CeedScalar B[6] = { - A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */ - A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */ - A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */ - A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */ - A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */ - 
A[0][2] * A[2][1] - A[0][1] * A[2][2] /* *NOPAD* */ - }; - for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA); - - return 0; -}; -#endif - -// ----------------------------------------------------------------------------- -// Residual evaluation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH2F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - // Store grad_u for HyperFSdF (Jacobian of HyperFSF) - CeedScalar(*grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1]; - // Store C_inv for HyperFSdF (Jacobian of HyperFSF) - CeedScalar(*C_inv)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[2]; - // Store constant lam_log_J = lambda*log(J) - CeedScalar(*lam_log_J)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[3]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Formulation Terminology: - // I3 : 3x3 Identity matrix - // C : right Cauchy-Green tensor - // C_inv : inverse of C - // F : deformation gradient - // S : 2nd Piola-Kirchhoff (in current config) - // P : 1st Piola-Kirchhoff (in referential config) - // Formulation: - // F = I3 + grad_ue - // J = det(F) - // C = F(^T)*F - // S = mu*I3 + (lambda*log(J)-mu)*C_inv; - // P = F*S - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], 
ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - grad_u[j][k][i] = 0; - for (CeedInt m = 0; m < 3; m++) grad_u[j][k][i] += dXdx[m][k] * du[j][m]; - } - } - - // I3 : 3x3 Identity matrix - // Compute The Deformation Gradient : F = I3 + grad_u - const CeedScalar F[3][3] = { - {grad_u[0][0][i] + 1, grad_u[0][1][i], grad_u[0][2][i] }, - {grad_u[1][0][i], grad_u[1][1][i] + 1, grad_u[1][2][i] }, - {grad_u[2][0][i], grad_u[2][1][i], grad_u[2][2][i] + 1} - }; - - // E - Green-Lagrange strain tensor - // E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = grad_u[indj[m]][indk[m]][i] + grad_u[indk[m]][indj[m]][i]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]][i] * grad_u[n][indk[m]][i]; - } - - const CeedScalar tempgradu[3][3] = { - {grad_u[0][0][i], grad_u[0][1][i], grad_u[0][2][i]}, - {grad_u[1][0][i], grad_u[1][1][i], grad_u[1][2][i]}, - {grad_u[2][0][i], grad_u[2][1][i], grad_u[2][2][i]} - }; - - const CeedScalar Jm1 = computeJM1(tempgradu); - const CeedScalar logJ = log1p_series_shifted(Jm1); - // store lam_log_J = lambda*log(J) - lam_log_J[0][i] = lambda * logJ; - - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - - // C : right Cauchy-Green tensor - // C = I + 2E - const CeedScalar C[3][3] = { - {1 + E2[0][0], E2[0][1], E2[0][2] }, - {E2[0][1], 1 + E2[1][1], 
E2[1][2] }, - {E2[0][2], E2[1][2], 1 + E2[2][2]} - }; - - // Compute C^(-1) : C-Inverse - const CeedScalar detC = (Jm1 + 1.) * (Jm1 + 1.); - CeedScalar Cinvwork[6]; - computeMatinvSym(C, detC, Cinvwork); - - // store C_inv - C_inv[0][i] = Cinvwork[0]; - C_inv[1][i] = Cinvwork[1]; - C_inv[2][i] = Cinvwork[2]; - C_inv[3][i] = Cinvwork[3]; - C_inv[4][i] = Cinvwork[4]; - C_inv[5][i] = Cinvwork[5]; - - const CeedScalar tempCinv[3][3] = { - {C_inv[0][i], C_inv[5][i], C_inv[4][i]}, - {C_inv[5][i], C_inv[1][i], C_inv[3][i]}, - {C_inv[4][i], C_inv[3][i], C_inv[2][i]} - }; - CeedScalar Swork[6]; - for (CeedInt m = 0; m < 6; m++) { - Swork[m] = lam_log_J[0][i] * C_inv[m][i]; - for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * tempCinv[indj[m]][n] * E2[n][indk[m]]; - } - // Second Piola-Kirchhoff (S) - const CeedScalar S[3][3] = { - {Swork[0], Swork[5], Swork[4]}, - {Swork[5], Swork[1], Swork[3]}, - {Swork[4], Swork[3], Swork[2]} - }; - - // Compute the First Piola-Kirchhoff : P = F*S - CeedScalar P[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - P[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) P[j][k] += F[j][m] * S[m][k]; - } - } - - // Apply dXdx^T and weight to P (First Piola-Kirchhoff) - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - dvdX[k][j][i] = 0; - for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * P[j][m] * wdetJ; - } - } - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Jacobian evaluation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH2dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - // 
grad_u is used for hyperelasticity (non-linear) - const CeedScalar(*grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2]; - const CeedScalar(*C_inv)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - // lam_log_J = lambda*log(J) - const CeedScalar(*lam_log_J)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4]; - // Outputs - CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of delta_u - const CeedScalar deltadu[3][3] = { - {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]}, - {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]}, - {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute graddeltau - // dXdx = (dx/dX)^(-1) - // Apply dXdx to deltadu = graddelta - CeedScalar graddeltau[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - graddeltau[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) graddeltau[j][k] += dXdx[m][k] * deltadu[j][m]; - } - } - - // I3 : 3x3 Identity matrix - // Deformation Gradient : F = I3 + grad_u - const CeedScalar F[3][3] = { - {grad_u[0][0][i] + 1, grad_u[0][1][i], grad_u[0][2][i] }, - {grad_u[1][0][i], grad_u[1][1][i] + 1, grad_u[1][2][i] }, - {grad_u[2][0][i], grad_u[2][1][i], grad_u[2][2][i] + 1} - }; - // E - Green-Lagrange strain tensor - // 
E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = grad_u[indj[m]][indk[m]][i] + grad_u[indk[m]][indj[m]][i]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]][i] * grad_u[n][indk[m]][i]; - } - - // deltaE - Green-Lagrange strain tensor - CeedScalar deltaEwork[6]; - for (CeedInt m = 0; m < 6; m++) { - deltaEwork[m] = 0; - for (CeedInt n = 0; n < 3; n++) deltaEwork[m] += (graddeltau[n][indj[m]] * F[n][indk[m]] + F[n][indj[m]] * graddeltau[n][indk[m]]) / 2.; - } - CeedScalar deltaE[3][3] = { - {deltaEwork[0], deltaEwork[5], deltaEwork[4]}, - {deltaEwork[5], deltaEwork[1], deltaEwork[3]}, - {deltaEwork[4], deltaEwork[3], deltaEwork[2]} - }; - - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - - const CeedScalar tempCinv[3][3] = { - {C_inv[0][i], C_inv[5][i], C_inv[4][i]}, - {C_inv[5][i], C_inv[1][i], C_inv[3][i]}, - {C_inv[4][i], C_inv[3][i], C_inv[2][i]} - }; - CeedScalar Swork[6]; - for (CeedInt m = 0; m < 6; m++) { - Swork[m] = lam_log_J[0][i] * C_inv[m][i]; - for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * tempCinv[indj[m]][n] * E2[n][indk[m]]; - } - // Second Piola-Kirchhoff (S) - const CeedScalar S[3][3] = { - {Swork[0], Swork[5], Swork[4]}, - {Swork[5], Swork[1], Swork[3]}, - {Swork[4], Swork[3], Swork[2]} - }; - - // deltaS = dSdE:deltaE - // = lambda(C_inv:deltaE)C_inv + 2(mu-lambda*log(J))C_inv*deltaE*C_inv - // -- C_inv:deltaE - CeedScalar Cinv_contract_E = 0; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) Cinv_contract_E += tempCinv[j][k] * deltaE[j][k]; - } - // -- deltaE*C_inv - CeedScalar deltaECinv[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - deltaECinv[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) deltaECinv[j][k] += deltaE[j][m] * 
tempCinv[m][k]; - } - } - // -- intermediate deltaS = C_inv*deltaE*C_inv - CeedScalar deltaS[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - deltaS[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) deltaS[j][k] += tempCinv[j][m] * deltaECinv[m][k]; - } - } - // -- deltaS = lambda(C_inv:deltaE)C_inv - 2(lambda*log(J)-mu)*(intermediate) - const CeedScalar llnj_m = lam_log_J[0][i] - mu; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) deltaS[j][k] = lambda * Cinv_contract_E * tempCinv[j][k] - 2. * llnj_m * deltaS[j][k]; - } - - // deltaP = dPdF:deltaF = deltaF*S + F*deltaS - CeedScalar deltaP[3][3]; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt k = 0; k < 3; k++) { - deltaP[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) deltaP[j][k] += graddeltau[j][m] * S[m][k] + F[j][m] * deltaS[m][k]; - } - } - - // Apply dXdx^T and weight - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - deltadvdX[k][j][i] = 0; - for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * deltaP[j][m] * wdetJ; - } - } - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Strain energy computation for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH2Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*energy) = (CeedScalar(*))out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk 
Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar grad_u[3][3]; - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - grad_u[j][k] = 0; - for (int m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // E - Green-Lagrange strain tensor - // E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = grad_u[indj[m]][indk[m]] + grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]] * grad_u[n][indk[m]]; - } - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - const CeedScalar Jm1 = computeJM1(grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - - // Strain energy Phi(E) for compressible Neo-Hookean - energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) 
* wdetJ; - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Nodal diagnostic quantities for hyperelasticity, finite strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH2Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; - - // Outputs - CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar grad_u[3][3]; - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - grad_u[j][k] = 0; - for (int m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // E - Green-Lagrange strain tensor - // E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u) - const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1}; - CeedScalar 
E2work[6]; - for (CeedInt m = 0; m < 6; m++) { - E2work[m] = grad_u[indj[m]][indk[m]] + grad_u[indk[m]][indj[m]]; - for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]] * grad_u[n][indk[m]]; - } - CeedScalar E2[3][3] = { - {E2work[0], E2work[5], E2work[4]}, - {E2work[5], E2work[1], E2work[3]}, - {E2work[4], E2work[3], E2work[2]} - }; - - // Displacement - diagnostic[0][i] = u[0][i]; - diagnostic[1][i] = u[1][i]; - diagnostic[2][i] = u[2][i]; - - // Pressure - const CeedScalar Jm1 = computeJM1(grad_u); - const CeedScalar logJ = log1p_series_shifted(Jm1); - diagnostic[3][i] = -lambda * logJ; - - // Stress tensor invariants - diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.; - diagnostic[5][i] = 0.; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.; - } - diagnostic[6][i] = Jm1 + 1.; - - // Strain energy - diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.); - } // End of Quadrature Point Loop - - return 0; -} -// ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h b/examples/solids/qfunctions/finite-strain-neo-hookean.h similarity index 96% rename from examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h rename to examples/solids/qfunctions/finite-strain-neo-hookean.h index 431c8e328a..5742c5e8ff 100644 --- a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h +++ b/examples/solids/qfunctions/finite-strain-neo-hookean.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// Hyperelasticity, finite strain for solid mechanics example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #ifndef PHYSICS_STRUCT #define PHYSICS_STRUCT @@ -52,7 +54,7 @@ CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) { y *= y2; sum += y / 7; return 2 * sum; -}; +} #endif // ----------------------------------------------------------------------------- @@ -66,7 +68,7 @@ CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) { grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] + grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] - grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1]; -}; +} #endif // ----------------------------------------------------------------------------- @@ -86,8 +88,8 @@ CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedS }; for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA); - return 0; -}; + return CEED_ERROR_SUCCESS; +} #endif // ----------------------------------------------------------------------------- @@ -136,13 +138,13 @@ CEED_QFUNCTION_HELPER int commonFS(const CeedScalar lambda, const CeedScalar mu, for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * C_inv[indj[m]][n] * E2[n][indk[m]]; } - return 0; -}; + return CEED_ERROR_SUCCESS; +} // ----------------------------------------------------------------------------- // Residual evaluation for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSResidual_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*ug)[3][CEED_Q_VLA] = 
(const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -242,13 +244,13 @@ CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, const CeedScalar *const } } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Jacobian evaluation for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSJacobian_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -387,13 +389,13 @@ CEED_QFUNCTION(ElasFSInitialNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const } } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Strain energy computation for hyperelasticity, finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSEnergy_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -457,13 +459,13 @@ CEED_QFUNCTION(ElasFSInitialNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *c } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Nodal diagnostic quantities for hyperelasticity, 
finite strain // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasFSInitialNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasFSDiagnostic_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; @@ -542,6 +544,6 @@ CEED_QFUNCTION(ElasFSInitialNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScala diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.); } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/linear.h b/examples/solids/qfunctions/linear.h index b688fdf495..b6f9573c05 100644 --- a/examples/solids/qfunctions/linear.h +++ b/examples/solids/qfunctions/linear.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// Linear elasticity for solid mechanics example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #ifndef PHYSICS_STRUCT #define PHYSICS_STRUCT @@ -23,7 +25,7 @@ struct Physics_private { // ----------------------------------------------------------------------------- // Residual evaluation for linear elasticity // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasLinearF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasResidual_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -113,13 +115,13 @@ CEED_QFUNCTION(ElasLinearF)(void *ctx, CeedInt Q, const CeedScalar *const *in, C } } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Jacobian evaluation for linear elasticity // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasLineardF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasJacobian_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -208,13 +210,13 @@ CEED_QFUNCTION(ElasLineardF)(void *ctx, CeedInt Q, const CeedScalar *const *in, } } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Strain energy computation for linear elasticity // 
----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasLinearEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasEnergy_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; @@ -275,13 +277,13 @@ CEED_QFUNCTION(ElasLinearEnergy)(void *ctx, CeedInt Q, const CeedScalar *const * } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- // Nodal diagnostic quantities for linear elasticity // ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasLinearDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ElasDiagnostic_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; @@ -357,6 +359,6 @@ CEED_QFUNCTION(ElasLinearDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *con (lambda * strain_vol * strain_vol / 2. 
+ strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu); } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } // ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/manufactured-force.h b/examples/solids/qfunctions/manufactured-force.h index 0764d103e3..41b761351f 100644 --- a/examples/solids/qfunctions/manufactured-force.h +++ b/examples/solids/qfunctions/manufactured-force.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// Linear elasticity manufactured solution forcing term for solid mechanics example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif #ifndef PHYSICS_STRUCT #define PHYSICS_STRUCT diff --git a/examples/solids/qfunctions/manufactured-true.h b/examples/solids/qfunctions/manufactured-true.h index 389fb6596f..25cffbd126 100644 --- a/examples/solids/qfunctions/manufactured-true.h +++ b/examples/solids/qfunctions/manufactured-true.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,10 @@ /// @file /// Linear elasticity manufactured solution true solution for solid mechanics example using PETSc -#include +#include +#ifndef CEED_RUNNING_JIT_PASS #include +#endif // ----------------------------------------------------------------------------- // True solution for linear elasticity manufactured solution diff --git a/examples/solids/qfunctions/small-strain-neo-hookean.h b/examples/solids/qfunctions/small-strain-neo-hookean.h deleted file mode 100644 index 95e0afa66c..0000000000 --- a/examples/solids/qfunctions/small-strain-neo-hookean.h +++ /dev/null @@ -1,410 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Hyperelasticity, small strain for solid mechanics example using PETSc - -#include -#include - -#ifndef PHYSICS_STRUCT -#define PHYSICS_STRUCT -typedef struct Physics_private *Physics; -struct Physics_private { - CeedScalar nu; // Poisson's ratio - CeedScalar E; // Young's Modulus -}; -#endif - -// ----------------------------------------------------------------------------- -// Series approximation of log1p() -// log1p() is not vectorized in libc -// -// The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1. -// ----------------------------------------------------------------------------- -#ifndef LOG1P_SERIES -#define LOG1P_SERIES -CEED_QFUNCTION_HELPER CeedScalar log1p_series(CeedScalar x) { - CeedScalar sum = 0; - CeedScalar y = x / (2. 
+ x); - const CeedScalar y2 = y * y; - sum += y; - y *= y2; - sum += y / 3; - y *= y2; - sum += y / 5; - y *= y2; - sum += y / 7; - return 2 * sum; -}; -#endif - -// ----------------------------------------------------------------------------- -// Residual evaluation for hyperelasticity, small strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasSSNHF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - // Store grad_u for HyperFSdF (Jacobian of HyperFSF) - CeedScalar(*grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - grad_u[j][k][i] = 0; - for (int m = 0; m < 3; m++) grad_u[j][k][i] += dXdx[m][k] * du[j][m]; - } - } - - // Compute Strain : e (epsilon) - // 
e = 1/2 (grad u + (grad u)^T) - const CeedScalar e00 = (grad_u[0][0][i] + grad_u[0][0][i]) / 2., e01 = (grad_u[0][1][i] + grad_u[1][0][i]) / 2., - e02 = (grad_u[0][2][i] + grad_u[2][0][i]) / 2., e11 = (grad_u[1][1][i] + grad_u[1][1][i]) / 2., - e12 = (grad_u[1][2][i] + grad_u[2][1][i]) / 2., e22 = (grad_u[2][2][i] + grad_u[2][2][i]) / 2.; - const CeedScalar e[3][3] = { - {e00, e01, e02}, - {e01, e11, e12}, - {e02, e12, e22} - }; - - // strain (epsilon) - // and - // stress (sigma) in Voigt notation: - // [e00] [sigma00] - // [e11] [sigma11] - // epsilon = [e22] , sigma = [sigma22] - // [e12] [sigma12] - // [e02] [sigma02] - // [e01] [sigma01] - // - // mu = E / (2 * (1 + nu)) - // bulk modulus = E / (2 * (1 - 2 * nu)) - // lambda = (3 * bulk modulus - 2 * mu) / 3 - // e_v = volumetric strain = e00 + e11 + e22 - // - // sigma = lambda * log(1 + e_v) + 2 * mu * epsilon - // - // Above Voigt Notation is placed in a 3x3 matrix: - // Volumetric strain - const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2]; - const CeedScalar llv = log1p_series(strain_vol); - const CeedScalar sigma00 = lambda * llv + TwoMu * e[0][0], sigma11 = lambda * llv + TwoMu * e[1][1], sigma22 = lambda * llv + TwoMu * e[2][2], - sigma12 = TwoMu * e[1][2], sigma02 = TwoMu * e[0][2], sigma01 = TwoMu * e[0][1]; - const CeedScalar sigma[3][3] = { - {sigma00, sigma01, sigma02}, - {sigma01, sigma11, sigma12}, - {sigma02, sigma12, sigma22} - }; - - // Apply dXdx^T and weight to sigma - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - dvdX[k][j][i] = 0; - for (int m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * sigma[j][m] * wdetJ; - } - } - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Jacobian evaluation for hyperelasticity, small strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasSSNHdF)(void *ctx, CeedInt 
Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - // grad_u is used for hyperelasticity (non-linear) - const CeedScalar(*grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2]; - - // Outputs - CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar deltadu[3][3] = { - {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]}, - {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]}, - {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute graddeltau - // Apply dXdx^-1 to deltadu = graddeltau - CeedScalar graddeltau[3][3]; - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - graddeltau[j][k] = 0; - for (int m = 0; m < 3; m++) graddeltau[j][k] += dXdx[m][k] * deltadu[j][m]; - } - } - - // Compute Strain : e (epsilon) - // e = 1/2 (grad u + (grad u)^T) - const CeedScalar de00 = (graddeltau[0][0] + graddeltau[0][0]) / 2., de01 = (graddeltau[0][1] + graddeltau[1][0]) / 2., - de02 = (graddeltau[0][2] + graddeltau[2][0]) / 2., de11 = (graddeltau[1][1] + graddeltau[1][1]) / 2., - de12 = (graddeltau[1][2] + graddeltau[2][1]) / 2., de22 = 
(graddeltau[2][2] + graddeltau[2][2]) / 2.; - const CeedScalar de[3][3] = { - {de00, de01, de02}, - {de01, de11, de12}, - {de02, de12, de22} - }; - - // strain (epsilon) - // and - // stress (sigma) in Voigt notation: - // [e00] [sigma00] - // [e11] [sigma11] - // depsilon = [e22] , dsigma = [sigma22] - // [e12] [sigma12] - // [e02] [sigma02] - // [e01] [sigma01] - // - // mu = E / (2 * (1 + nu)) - // bulk modulus = E / (2 * (1 - 2 * nu)) - // lambda = (3 * bulk modulus - 2 * mu) / 3 - // e_v = volumetric strain = e00 + e11 + e22 - // lambda bar = lambda / (1 + e_v) - // - // dSigma = S * epsilon - // - // S_ijkl = lambda bar * delta_ij * delta_kl + 2 * mu * delta_ik * delta_jl - // - // Matrix form: - // - // [2 mu + lambda bar lambda bar lambda bar ] - // [ lambda bar 2 mu + lambda bar lambda bar ] - // [ lambda bar lambda bar 2 mu + lambda bar ] - // S = [ mu ] - // [ mu ] - // [ mu ] - // - // Above Voigt Notation is placed in a 3x3 matrix: - const CeedScalar strain_vol = grad_u[0][0][i] + grad_u[1][1][i] + grad_u[2][2][i]; - const CeedScalar lambda_bar = lambda / (1 + strain_vol); - const CeedScalar lambda_dtrace = lambda_bar * (de[0][0] + de[1][1] + de[2][2]); - const CeedScalar dsigma00 = lambda_dtrace + TwoMu * de[0][0], dsigma11 = lambda_dtrace + TwoMu * de[1][1], - dsigma22 = lambda_dtrace + TwoMu * de[2][2], dsigma12 = TwoMu * de[1][2], dsigma02 = TwoMu * de[0][2], - dsigma01 = TwoMu * de[0][1]; - const CeedScalar dsigma[3][3] = { - {dsigma00, dsigma01, dsigma02}, - {dsigma01, dsigma11, dsigma12}, - {dsigma02, dsigma12, dsigma22} - }; - - // Apply dXdx^-T and weight - for (int j = 0; j < 3; j++) { // Component - for (int k = 0; k < 3; k++) { // Derivative - deltadvdX[k][j][i] = 0; - for (int m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * dsigma[j][m] * wdetJ; - } - } - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Strain energy computation for 
hyperelasticity, small strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasSSNHEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - - // Outputs - CeedScalar(*energy) = (CeedScalar(*))out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar wdetJ = q_data[0][i]; - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar grad_u[3][3]; - for (CeedInt j = 0; j < 3; j++) { // Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - grad_u[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // Compute Strain : e (epsilon) - // e = 1/2 (grad u + (grad u)^T) - - const CeedScalar e[3][3] = { - {(grad_u[0][0] + grad_u[0][0]) / 2., (grad_u[0][1] + grad_u[1][0]) / 2., (grad_u[0][2] + grad_u[2][0]) / 2.}, - {(grad_u[1][0] + grad_u[0][1]) / 2., (grad_u[1][1] + grad_u[1][1]) / 2., (grad_u[1][2] + grad_u[2][1]) / 2.}, - {(grad_u[2][0] + grad_u[0][2]) / 2., (grad_u[2][1] + grad_u[1][2]) / 
2., (grad_u[2][2] + grad_u[2][2]) / 2.} - }; - - // Strain Energy - const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2]; - const CeedScalar llv = log1p_series(strain_vol); - energy[i] = - (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu) * wdetJ; - - } // End of Quadrature Point Loop - - return 0; -} - -// ----------------------------------------------------------------------------- -// Nodal diagnostic quantities for hyperelasticity, small strain -// ----------------------------------------------------------------------------- -CEED_QFUNCTION(ElasSSNHDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - // Inputs - const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; - - // Outputs - CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - // Context - const Physics context = (Physics)ctx; - const CeedScalar E = context->E; - const CeedScalar nu = context->nu; - - // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus - const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; - - // Quadrature Point Loop - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { - // Read spatial derivatives of u - const CeedScalar du[3][3] = { - {ug[0][0][i], ug[1][0][i], ug[2][0][i]}, - {ug[0][1][i], ug[1][1][i], ug[2][1][i]}, - {ug[0][2][i], ug[1][2][i], ug[2][2][i]} - }; - // -- Qdata - const CeedScalar dXdx[3][3] = { - {q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; - - // Compute grad_u - // dXdx = (dx/dX)^(-1) - // Apply dXdx to du = grad_u - CeedScalar grad_u[3][3]; - for (CeedInt j = 0; j < 3; j++) { // 
Component - for (CeedInt k = 0; k < 3; k++) { // Derivative - grad_u[j][k] = 0; - for (CeedInt m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m]; - } - } - - // Compute Strain : e (epsilon) - // e = 1/2 (grad u + (grad u)^T) - - const CeedScalar e[3][3] = { - {(grad_u[0][0] + grad_u[0][0]) / 2., (grad_u[0][1] + grad_u[1][0]) / 2., (grad_u[0][2] + grad_u[2][0]) / 2.}, - {(grad_u[1][0] + grad_u[0][1]) / 2., (grad_u[1][1] + grad_u[1][1]) / 2., (grad_u[1][2] + grad_u[2][1]) / 2.}, - {(grad_u[2][0] + grad_u[0][2]) / 2., (grad_u[2][1] + grad_u[1][2]) / 2., (grad_u[2][2] + grad_u[2][2]) / 2.} - }; - - // Displacement - diagnostic[0][i] = u[0][i]; - diagnostic[1][i] = u[1][i]; - diagnostic[2][i] = u[2][i]; - - // Pressure - const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2]; - const CeedScalar llv = log1p_series(strain_vol); - diagnostic[3][i] = -lambda * llv; - - // Stress tensor invariants - diagnostic[4][i] = strain_vol; - diagnostic[5][i] = 0.; - for (CeedInt j = 0; j < 3; j++) { - for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += e[j][m] * e[m][j]; - } - diagnostic[6][i] = 1 + strain_vol; - - // Strain energy - diagnostic[7][i] = - (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu); - } // End of Quadrature Point Loop - - return 0; -} -// ----------------------------------------------------------------------------- diff --git a/examples/solids/qfunctions/traction-boundary.h b/examples/solids/qfunctions/traction-boundary.h index 181b176d0a..6cc3c2e16d 100644 --- a/examples/solids/qfunctions/traction-boundary.h +++ b/examples/solids/qfunctions/traction-boundary.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,7 @@ /// @file /// Geometric factors for solid mechanics example using PETSc -#include +#include // ----------------------------------------------------------------------------- // This QFunction computes the surface integral of the user traction vector on the constrained faces. diff --git a/examples/solids/src/boundary.c b/examples/solids/src/boundary.c index 2985d0d21a..2fdaac80ea 100644 --- a/examples/solids/src/boundary.c +++ b/examples/solids/src/boundary.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/src/cl-options.c b/examples/solids/src/cl-options.c index 935ee6c9b8..3dc3d7effb 100644 --- a/examples/solids/src/cl-options.c +++ b/examples/solids/src/cl-options.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -57,10 +57,7 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) { app_ctx->forcing_vector[2] = 0; PetscCall(PetscOptionsScalarArray("-forcing_vec", "Direction to apply constant force", NULL, app_ctx->forcing_vector, &max_n, NULL)); - if ((app_ctx->problem_choice == ELAS_FSInitial_NH1 || app_ctx->problem_choice == ELAS_FSInitial_NH2 || - app_ctx->problem_choice == ELAS_FSCurrent_NH1 || app_ctx->problem_choice == ELAS_FSCurrent_NH2 || - app_ctx->problem_choice == ELAS_FSInitial_MR1) && - app_ctx->forcing_choice == FORCE_CONST) { + if ((app_ctx->problem_choice == ELAS_FS_NH || app_ctx->problem_choice == ELAS_FS_MR) && app_ctx->forcing_choice == FORCE_CONST) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot use constant forcing and finite strain formulation. " "Constant forcing in reference frame currently unavailable."); @@ -68,8 +65,8 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) { // Dirichlet boundary conditions app_ctx->bc_clamp_count = 16; - PetscCall( - PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count, NULL)); + PetscCall(PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count, + NULL)); // Set vector for each clamped BC for (PetscInt i = 0; i < app_ctx->bc_clamp_count; i++) { // Translation vector diff --git a/examples/solids/src/matops.c b/examples/solids/src/matops.c index bef9960fb5..31930d2446 100644 --- a/examples/solids/src/matops.c +++ b/examples/solids/src/matops.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/src/misc.c b/examples/solids/src/misc.c index d4f1986473..6c45e893b6 100644 --- a/examples/solids/src/misc.c +++ b/examples/solids/src/misc.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/examples/solids/src/setup-dm.c b/examples/solids/src/setup-dm.c index e70b4738dc..9a4d55a356 100644 --- a/examples/solids/src/setup-dm.c +++ b/examples/solids/src/setup-dm.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -43,7 +43,7 @@ PetscErrorCode CreateDistributedDM(MPI_Comm comm, AppCtx app_ctx, DM *dm) { PetscInt dim = 3, faces[3] = {3, 3, 3}; PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &dim, NULL)); if (!dim) dim = 3; - PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, dm)); + PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, 0, PETSC_FALSE, dm)); } else { PetscCall(DMPlexCreateFromFile(comm, filename, NULL, interpolate, dm)); } diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c index 608278ec5c..bfe153fcbf 100644 --- a/examples/solids/src/setup-libceed.c +++ b/examples/solids/src/setup-libceed.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -20,11 +20,6 @@ #include "../qfunctions/manufactured-force.h" // Manufactured solution forcing #include "../qfunctions/traction-boundary.h" // Traction boundaries -#if PETSC_VERSION_LT(3, 14, 0) -#define DMPlexGetClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexGetClosureIndices(a, b, c, d, f, g, i) -#define DMPlexRestoreClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexRestoreClosureIndices(a, b, c, d, f, g, i) -#endif - // ----------------------------------------------------------------------------- // Problem options // ----------------------------------------------------------------------------- @@ -321,8 +316,8 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, Ceed CeedOperator op_traction; CeedQFunctionContextSetData(traction_ctx, CEED_MEM_HOST, CEED_USE_POINTER, 3 * sizeof(CeedScalar), app_ctx->bc_traction_vector[i]); // Setup restriction - PetscCall( - GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face, NULL)); + PetscCall(GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face, + NULL)); // ---- Create boundary Operator CeedOperatorCreate(ceed, qf_traction, NULL, NULL, &op_traction); CeedOperatorSetField(op_traction, "dx", elem_restr_x_face, basis_x_face, CEED_VECTOR_ACTIVE); diff --git a/gallery/ceed-gallery-list.h b/gallery/ceed-gallery-list.h index 4fa8a08227..c1829eef64 100644 --- a/gallery/ceed-gallery-list.h +++ b/gallery/ceed-gallery-list.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -13,6 +13,7 @@ // At the time of this writing, all the gallery functions are defined, but we're adopting the same strategy here as for the backends because future gallery @ref CeedQFunction might depend on external libraries. CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Identity) +CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_IdentityScalar) CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass1DBuild) CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass2DBuild) CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass3DBuild) @@ -28,3 +29,4 @@ CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson1DApply) CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson2DApply) CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson3DApply) CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Scale) +CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_ScaleScalar) diff --git a/gallery/ceed-gallery-weak.c b/gallery/ceed-gallery-weak.c deleted file mode 100644 index bb983b9a56..0000000000 --- a/gallery/ceed-gallery-weak.c +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include -#include - -// This function provides a debug target for weak symbols -// LCOV_EXCL_START -static int CeedQFunctionRegister_Weak(const char *name) { - CeedDebugEnv("** Weak Register: %s", name); - return CEED_ERROR_SUCCESS; -} -// LCOV_EXCL_STOP - -#define CEED_GALLERY_QFUNCTION(name) \ - CEED_INTERN int name(void) __attribute__((weak)); \ - int name(void) { return CeedQFunctionRegister_Weak(__func__); } -#include "ceed-gallery-list.h" -#undef CEED_GALLERY_QFUNCTION diff --git a/gallery/identity/ceed-identity-to-scalar.c b/gallery/identity/ceed-identity-to-scalar.c new file mode 100644 index 0000000000..403fcbafe5 --- /dev/null +++ b/gallery/identity/ceed-identity-to-scalar.c @@ -0,0 +1,34 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include +#include +#include + +/** + @brief Set fields identity `CeedQFunction` that copies first input component directly into output +**/ +static int CeedQFunctionInit_IdentityScalar(Ceed ceed, const char *requested, CeedQFunction qf) { + // Check QFunction name + const char *name = "Identity to scalar"; + + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + + // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here + + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Register identity `CeedQFunction` that copies first input component directly into output +**/ +CEED_INTERN int CeedQFunctionRegister_IdentityScalar(void) { + return CeedQFunctionRegister("Identity to 
scalar", IdentityScalar_loc, 1, IdentityScalar, CeedQFunctionInit_IdentityScalar); +} diff --git a/gallery/identity/ceed-identity.c b/gallery/identity/ceed-identity.c index aa7f59eda4..415d19a274 100644 --- a/gallery/identity/ceed-identity.c +++ b/gallery/identity/ceed-identity.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -17,21 +17,22 @@ static int CeedQFunctionInit_Identity(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Identity"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here - CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0)); - // Context data CeedQFunctionContext ctx; IdentityCtx ctx_data = {.size = 1}; + CeedCall(CeedQFunctionContextCreate(ceed, &ctx)); CeedCall(CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctx_data), &ctx_data)); CeedCall(CeedQFunctionContextRegisterInt32(ctx, "size", offsetof(IdentityCtx, size), 1, "field size of identity QFunction")); CeedCall(CeedQFunctionSetContext(qf, ctx)); CeedCall(CeedQFunctionContextDestroy(&ctx)); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0)); return CEED_ERROR_SUCCESS; } diff --git a/gallery/mass-vector/ceed-vectormassapply.c b/gallery/mass-vector/ceed-vectormassapply.c index c47d77fe48..bae789a0c5 100644 --- a/gallery/mass-vector/ceed-vectormassapply.c +++ b/gallery/mass-vector/ceed-vectormassapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Vector3MassApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3MassApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt num_comp = 3; + CeedCall(CeedQFunctionAddInput(qf, "u", num_comp, CEED_EVAL_INTERP)); CeedCall(CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "v", num_comp, CEED_EVAL_INTERP)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/mass/ceed-mass1dbuild.c b/gallery/mass/ceed-mass1dbuild.c index db4454f7b6..7931ad0c36 100644 --- a/gallery/mass/ceed-mass1dbuild.c +++ b/gallery/mass/ceed-mass1dbuild.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Mass1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Mass1DBuild"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 1; + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/mass/ceed-mass2dbuild.c b/gallery/mass/ceed-mass2dbuild.c index 52e10dec27..961ddbf2e9 100644 --- a/gallery/mass/ceed-mass2dbuild.c +++ b/gallery/mass/ceed-mass2dbuild.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Mass2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Mass2DBuild"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 2; + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 4)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/mass/ceed-mass3dbuild.c b/gallery/mass/ceed-mass3dbuild.c index fcb3ab23f3..e4edf2dd85 100644 --- a/gallery/mass/ceed-mass3dbuild.c +++ b/gallery/mass/ceed-mass3dbuild.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Mass3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Mass3DBuild"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 3; + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/mass/ceed-massapply.c b/gallery/mass/ceed-massapply.c index d213a7a359..11c19aa799 100644 --- a/gallery/mass/ceed-massapply.c +++ b/gallery/mass/ceed-massapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,6 +16,7 @@ static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "MassApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields @@ -24,7 +25,6 @@ static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFu CeedCall(CeedQFunctionAddOutput(qf, "v", 1, CEED_EVAL_INTERP)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c index 2e8578d5a3..d49026a97d 100644 --- a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c +++ b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Vector3Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3Poisson1DApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 1, num_comp = 3; + CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c index 8eb96609ff..7e4031f477 100644 --- a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c +++ b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Vector3Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3Poisson2DApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 2, num_comp = 3; + CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 6)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c index 2506db2b45..9e1864287f 100644 --- a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c +++ b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Vector3Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3Poisson3DApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 3, num_comp = 3; + CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 15)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson/ceed-poisson1dapply.c b/gallery/poisson/ceed-poisson1dapply.c index a9b6cef825..b007a60092 100644 --- a/gallery/poisson/ceed-poisson1dapply.c +++ b/gallery/poisson/ceed-poisson1dapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson1DApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 1; + CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson/ceed-poisson1dbuild.c b/gallery/poisson/ceed-poisson1dbuild.c index 69f4e1fb50..cd8075a0e8 100644 --- a/gallery/poisson/ceed-poisson1dbuild.c +++ b/gallery/poisson/ceed-poisson1dbuild.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Poisson1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson1DBuild"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 1; + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson/ceed-poisson2dapply.c b/gallery/poisson/ceed-poisson2dapply.c index 5eb2d058bb..d055386dfe 100644 --- a/gallery/poisson/ceed-poisson2dapply.c +++ b/gallery/poisson/ceed-poisson2dapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson2DApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 2; + CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 6)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson/ceed-poisson2dbuild.c b/gallery/poisson/ceed-poisson2dbuild.c index 60a13dd7a6..7768b0d29f 100644 --- a/gallery/poisson/ceed-poisson2dbuild.c +++ b/gallery/poisson/ceed-poisson2dbuild.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Poisson2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson2DBuild"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 2; + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 17)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson/ceed-poisson3dapply.c b/gallery/poisson/ceed-poisson3dapply.c index 7af449b13e..bcce1a9dc1 100644 --- a/gallery/poisson/ceed-poisson3dapply.c +++ b/gallery/poisson/ceed-poisson3dapply.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson3DApply"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 3; + CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/poisson/ceed-poisson3dbuild.c b/gallery/poisson/ceed-poisson3dbuild.c index 5471701b10..3ae866ed7d 100644 --- a/gallery/poisson/ceed-poisson3dbuild.c +++ b/gallery/poisson/ceed-poisson3dbuild.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,16 +16,17 @@ static int CeedQFunctionInit_Poisson3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson3DBuild"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // Add QFunction fields const CeedInt dim = 3; + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 69)); - return CEED_ERROR_SUCCESS; } diff --git a/gallery/scale/ceed-scale-scalar.c b/gallery/scale/ceed-scale-scalar.c new file mode 100644 index 0000000000..ff950dbf49 --- /dev/null +++ b/gallery/scale/ceed-scale-scalar.c @@ -0,0 +1,31 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include +#include + +/** + @brief Set fields for vector scaling `CeedQFunction` that scales inputs +**/ +static int CeedQFunctionInit_ScaleScalar(Ceed ceed, const char *requested, CeedQFunction qf) { + // Check QFunction name + const char *name = "Scale (scalar)"; + + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + + // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here + return CEED_ERROR_SUCCESS; +} + +/** + @brief Register scaling `CeedQFunction` +**/ +CEED_INTERN int CeedQFunctionRegister_ScaleScalar(void) { + return CeedQFunctionRegister("Scale (scalar)", ScaleScalar_loc, 1, ScaleScalar, CeedQFunctionInit_ScaleScalar); +} diff --git a/gallery/scale/ceed-scale.c b/gallery/scale/ceed-scale.c index 93fd9be24e..f998ac38e4 100644 --- a/gallery/scale/ceed-scale.c +++ b/gallery/scale/ceed-scale.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -16,10 +16,10 @@ static int CeedQFunctionInit_Scale(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Scale"; + CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed-fortran-name.h b/include/ceed-fortran-name.h index 192356fbc6..1646f3deeb 100644 --- a/include/ceed-fortran-name.h +++ b/include/ceed-fortran-name.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed-impl.h b/include/ceed-impl.h index 52b6beb633..e5f8773f37 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -82,16 +82,37 @@ typedef struct { Ceed delegate; } ObjDelegate; +// Work vector tracking +typedef struct CeedWorkVectors_private *CeedWorkVectors; +struct CeedWorkVectors_private { + CeedInt num_vecs, max_vecs; + bool *is_in_use; + CeedVector *vecs; +}; + +typedef struct CeedObject_private { + Ceed ceed; + int (*View)(CeedObject, FILE *); + int (*Destroy)(CeedObject *); + int ref_count; + CeedInt num_view_tabs; +} CeedObject_private; + struct Ceed_private { - const char *resource; - Ceed delegate; - Ceed parent; - ObjDelegate *obj_delegates; - int obj_delegate_count; - Ceed op_fallback_ceed, op_fallback_parent; - const char *op_fallback_resource; - char **jit_source_roots; - CeedInt num_jit_source_roots; + CeedObject_private obj; + const char *resource; + Ceed delegate; + Ceed parent; + ObjDelegate *obj_delegates; + int obj_delegate_count; + Ceed op_fallback_ceed; + char **jit_source_roots; + char **rust_source_roots; + CeedInt num_rust_source_roots, max_rust_source_roots, num_rust_source_roots_readers; + CeedInt num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers; + bool cuda_compile_with_clang; + char **jit_defines; + CeedInt num_jit_defines, max_jit_defines, num_jit_defines_readers; int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *); int (*SetStream)(Ceed, void *); int (*GetPreferredMemType)(CeedMemType *); @@ -113,21 +134,22 @@ struct Ceed_private { int (*OperatorCreate)(CeedOperator); int (*OperatorCreateAtPoints)(CeedOperator); int (*CompositeOperatorCreate)(CeedOperator); - int ref_count; - void *data; - bool is_debug; - bool has_valid_op_fallback_resource; - bool is_deterministic; - char err_msg[CEED_MAX_RESOURCE_LEN]; - FOffset *f_offsets; + void *data; + bool is_debug; + bool is_deterministic; + char err_msg[CEED_MAX_RESOURCE_LEN]; + FOffset *f_offsets; + CeedWorkVectors work_vectors; }; struct CeedVector_private { - Ceed ceed; + CeedObject_private obj; int 
(*HasValidArray)(CeedVector, bool *); int (*HasBorrowedArrayOfType)(CeedVector, CeedMemType, bool *); + int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedVector); int (*SetArray)(CeedVector, CeedMemType, CeedCopyMode, CeedScalar *); int (*SetValue)(CeedVector, CeedScalar); + int (*SetValueStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedScalar); int (*SyncArray)(CeedVector, CeedMemType); int (*TakeArray)(CeedVector, CeedMemType, CeedScalar **); int (*GetArray)(CeedVector, CeedMemType, CeedScalar **); @@ -142,7 +164,6 @@ struct CeedVector_private { int (*PointwiseMult)(CeedVector, CeedVector, CeedVector); int (*Reciprocal)(CeedVector); int (*Destroy)(CeedVector); - int ref_count; CeedSize length; uint64_t state; uint64_t num_readers; @@ -150,18 +171,18 @@ struct CeedVector_private { }; struct CeedElemRestriction_private { - Ceed ceed; + CeedObject_private obj; CeedElemRestriction rstr_base; int (*Apply)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); int (*ApplyUnsigned)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); int (*ApplyUnoriented)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); int (*ApplyAtPointsInElement)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); int (*ApplyBlock)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); + int (*GetAtPointsElementOffset)(CeedElemRestriction, CeedInt, CeedSize *); int (*GetOffsets)(CeedElemRestriction, CeedMemType, const CeedInt **); int (*GetOrientations)(CeedElemRestriction, CeedMemType, const bool **); int (*GetCurlOrientations)(CeedElemRestriction, CeedMemType, const CeedInt8 **); int (*Destroy)(CeedElemRestriction); - int ref_count; CeedInt num_elem; /* number of elements */ CeedInt elem_size; /* number of nodes per element */ CeedInt num_points; /* number of points, for points restriction */ @@ -181,11 +202,12 @@ struct 
CeedElemRestriction_private { }; struct CeedBasis_private { - Ceed ceed; + CeedObject_private obj; int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); - int (*ApplyAtPoints)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); + int (*ApplyAdd)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); + int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); + int (*ApplyAddAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); int (*Destroy)(CeedBasis); - int ref_count; bool is_tensor_basis; /* flag for tensor basis */ CeedInt dim; /* topological dimension */ CeedElemTopology topo; /* element topology */ @@ -212,11 +234,10 @@ struct CeedBasis_private { }; struct CeedTensorContract_private { - Ceed ceed; + CeedObject_private obj; int (*Apply)(CeedTensorContract, CeedInt, CeedInt, CeedInt, CeedInt, const CeedScalar *restrict, CeedTransposeMode, const CeedInt, const CeedScalar *restrict, CeedScalar *restrict); int (*Destroy)(CeedTensorContract); - int ref_count; void *data; }; @@ -227,12 +248,11 @@ struct CeedQFunctionField_private { }; struct CeedQFunction_private { - Ceed ceed; + CeedObject_private obj; int (*Apply)(CeedQFunction, CeedInt, CeedVector *, CeedVector *); int (*SetCUDAUserFunction)(CeedQFunction, void *); int (*SetHIPUserFunction)(CeedQFunction, void *); int (*Destroy)(CeedQFunction); - int ref_count; CeedInt vec_length; /* Number of quadrature points must be padded to a multiple of vec_length */ CeedQFunctionField *input_fields; CeedQFunctionField *output_fields; @@ -253,8 +273,7 @@ struct CeedQFunction_private { }; struct CeedQFunctionContext_private { - Ceed ceed; - int ref_count; + CeedObject_private obj; int (*HasValidData)(CeedQFunctionContext, bool *); int (*HasBorrowedDataOfType)(CeedQFunctionContext, CeedMemType, bool 
*); int (*SetData)(CeedQFunctionContext, CeedMemType, CeedCopyMode, void *); @@ -329,9 +348,8 @@ struct CeedOperatorAssemblyData_private { }; struct CeedOperator_private { - Ceed ceed; - CeedOperator op_fallback, op_fallback_parent; - int ref_count; + CeedObject_private obj; + CeedOperator op_fallback, op_fallback_parent; int (*LinearAssembleQFunction)(CeedOperator, CeedVector *, CeedElemRestriction *, CeedRequest *); int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *); int (*LinearAssembleDiagonal)(CeedOperator, CeedVector, CeedRequest *); @@ -364,6 +382,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; + bool is_sequential; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed.h b/include/ceed.h index effe28eaf1..b905b30851 100644 --- a/include/ceed.h +++ b/include/ceed.h @@ -1 +1,5 @@ +#ifdef CEED_RUNNING_JIT_PASS +#include "ceed/types.h" +#else #include "ceed/ceed.h" +#endif diff --git a/include/ceed/backend.h b/include/ceed/backend.h index 72868c1ff0..87f5f32c25 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -74,8 +74,12 @@ #define CeedPragmaCritical(x) CeedPragmaOMP(critical(x)) #endif +/// This macro provides the tab width for viewing Ceed objects. +/// @ingroup Ceed +#define CEED_TAB_WIDTH 2 + /** - This enum supples common colors for CeedDebug256 debugging output. + This enum supplies common colors for CeedDebug256 debugging output. Set the environment variable `CEED_DEBUG = 1` to activate debugging output. 
@ingroup Ceed @@ -142,8 +146,10 @@ CEED_EXTERN bool CeedDebugFlagEnv(void); @ingroup Ceed @ref Backend **/ -#define CeedWarn(...) \ - { CeedDebugImpl256(CEED_DEBUG_COLOR_WARNING, ##__VA_ARGS__); } +#define CeedWarn(...) \ + { \ + CeedDebugImpl256(CEED_DEBUG_COLOR_WARNING, ##__VA_ARGS__); \ + } /** Swap the values of two CeedScalars @@ -180,6 +186,11 @@ CEED_INTERN int CeedReallocArray(size_t n, size_t unit, void *p); CEED_INTERN int CeedStringAllocCopy(const char *source, char **copy); CEED_INTERN int CeedFree(void *p); +CEED_INTERN int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), int (*destroy_function)(CeedObject *), CeedObject obj); +CEED_INTERN int CeedObjectReference(CeedObject obj); +CEED_INTERN int CeedObjectDereference(CeedObject obj); +CEED_INTERN int CeedObjectDestroy_Private(CeedObject obj); + CEED_INTERN int CeedSetHostBoolArray(const bool *source_array, CeedCopyMode copy_mode, CeedSize num_values, const bool **target_array_owned, const bool **target_array_borrowed, const bool **target_array); CEED_INTERN int CeedSetHostCeedInt8Array(const CeedInt8 *source_array, CeedCopyMode copy_mode, CeedSize num_values, @@ -244,14 +255,23 @@ CEED_EXTERN int CeedGetDelegate(Ceed ceed, Ceed *delegate); CEED_EXTERN int CeedSetDelegate(Ceed ceed, Ceed delegate); CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name); CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name); -CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource); CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); -CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource); +CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void)); 
CEED_EXTERN int CeedGetData(Ceed ceed, void *data); CEED_EXTERN int CeedSetData(Ceed ceed, void *data); CEED_EXTERN int CeedReference(Ceed ceed); +CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec); +CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec); +CEED_EXTERN int CeedClearWorkVectors(Ceed ceed, CeedSize min_len); +CEED_EXTERN int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb); +CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots); +CEED_EXTERN int CeedGetRustSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***rust_source_roots); +CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots); +CEED_EXTERN int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_roots); +CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines); +CEED_EXTERN int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines); CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array); CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type); @@ -261,8 +281,12 @@ CEED_EXTERN int CeedVectorGetData(CeedVector vec, void *data); CEED_EXTERN int CeedVectorSetData(CeedVector vec, void *data); CEED_EXTERN int CeedVectorReference(CeedVector vec); -/// Type of element restriction; -/// @ingroup CeedElemRestriction +/** + Specify type of restriction operation. 
+ + @ingroup CeedElemRestriction + @ref Backend +**/ typedef enum { /// Standard element restriction with offsets CEED_RESTRICTION_STANDARD = 1, @@ -278,7 +302,7 @@ typedef enum { CEED_EXTERN int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rstr_type); CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided); -CEED_EXTERN int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points); +CEED_EXTERN int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points); CEED_EXTERN int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible); CEED_EXTERN int CeedElemRestrictionGetStrides(CeedElemRestriction rstr, CeedInt strides[3]); CEED_EXTERN int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, bool *has_backend_strides); @@ -292,13 +316,19 @@ CEED_EXTERN int CeedElemRestrictionGetLLayout(CeedElemRestriction rstr, CeedInt CEED_EXTERN int CeedElemRestrictionSetLLayout(CeedElemRestriction rstr, CeedInt layout[3]); CEED_EXTERN int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, CeedInt layout[3]); CEED_EXTERN int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]); +CEED_EXTERN int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset); +CEED_EXTERN int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size); CEED_EXTERN int CeedElemRestrictionGetData(CeedElemRestriction rstr, void *data); CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data); CEED_EXTERN int CeedElemRestrictionReference(CeedElemRestriction rstr); CEED_EXTERN int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedSize *flops); -/// Type of FE space; -/// @ingroup CeedBasis +/** + Specify type of FE space. 
+ + @ingroup CeedBasis + @ref Backend +**/ typedef enum { /// H^1 FE space CEED_FE_SPACE_H1 = 1, @@ -310,16 +340,22 @@ typedef enum { CEED_EXTERN const char *const CeedFESpaces[]; CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d); +CEED_EXTERN int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d); CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor); +CEED_EXTERN int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated); CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data); CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data); CEED_EXTERN int CeedBasisReference(CeedBasis basis); CEED_EXTERN int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp); -CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops); +CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points, + CeedSize *flops); CEED_EXTERN int CeedBasisGetFESpace(CeedBasis basis, CeedFESpace *fe_space); CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim); CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract); CEED_EXTERN int CeedBasisSetTensorContract(CeedBasis basis, CeedTensorContract contract); +CEED_EXTERN int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, + const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, + CeedBasis basis); CEED_EXTERN int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract); CEED_EXTERN int CeedTensorContractApply(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *__restrict__ t, @@ -341,6 +377,7 @@ CEED_EXTERN int 
CeedQFunctionSetFortranStatus(CeedQFunction qf, bool status); CEED_EXTERN int CeedQFunctionGetVectorLength(CeedQFunction qf, CeedInt *vec_length); CEED_EXTERN int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input_fields, CeedInt *num_output_fields); CEED_EXTERN int CeedQFunctionGetKernelName(CeedQFunction qf, const char **kernel_name); +CEED_EXTERN int CeedQFunctionGetName(CeedQFunction qf, const char **name); CEED_EXTERN int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path); CEED_EXTERN int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer); CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f); @@ -390,6 +427,8 @@ CEED_EXTERN int CeedQFunctionContextReference(CeedQFunctionContext ctx); CEED_EXTERN int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr); CEED_EXTERN int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *pointblock_rstr); + +CEED_EXTERN int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyData *data); CEED_EXTERN int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data); CEED_EXTERN int CeedQFunctionAssemblyDataReference(CeedQFunctionAssemblyData data); CEED_EXTERN int CeedQFunctionAssemblyDataSetReuse(CeedQFunctionAssemblyData data, bool reuse_assembly_data); @@ -401,6 +440,7 @@ CEED_EXTERN int CeedQFunctionAssemblyDataSetObjects(CeedQFunctionAssemblyData da CEED_EXTERN int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData data, CeedVector *vec, CeedElemRestriction *rstr); CEED_EXTERN int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data); +CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data); CEED_EXTERN int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssemblyData *data); CEED_EXTERN int 
CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, CeedInt *num_active_bases_in, CeedInt **num_eval_modes_in, const CeedEvalMode ***eval_modes_in, CeedSize ***eval_mode_offsets_in, @@ -415,7 +455,6 @@ CEED_EXTERN int CeedOperatorAssemblyDataGetElemRestrictions(CeedOperatorAssembly CeedElemRestriction **active_elem_rstrs_out); CEED_EXTERN int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data); -CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data); CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis); CEED_EXTERN int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis); CEED_EXTERN int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr); @@ -433,6 +472,9 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op); CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback); CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent); CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent); +CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request); +CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedInt offset, CeedVector values); CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op); CEED_INTERN int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, const CeedScalar *mat_B, CeedScalar *mat_C, CeedInt m, CeedInt n, diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index e605c47a4b..ede7251d81 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -1,4 +1,4 @@ -/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. /// /// SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,9 @@ /// @file /// Public header for definitions related to using FP32 floating point (single precision) for CeedScalar. /// Include this header in ceed.h to use float instead of double. -#ifndef CEED_F32_H -#define CEED_F32_H +#pragma once + +#define CEED_SCALAR_IS_FP32 /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP32 @@ -17,5 +18,3 @@ typedef float CeedScalar; /// Machine epsilon #define CEED_EPSILON 6e-08 - -#endif // CEED_F32_H diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index 3e6876cc19..88e37972f9 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -1,4 +1,4 @@ -/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. /// /// SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,9 @@ /// @file /// Public header for definitions related to using FP64 floating point (double precision) for CeedScalar. /// This is the default header included in ceed.h. -#ifndef CEED_F64_H -#define CEED_F64_H +#pragma once + +#define CEED_SCALAR_IS_FP64 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.) 
#define CEED_SCALAR_TYPE CEED_SCALAR_FP64 @@ -17,5 +18,3 @@ typedef double CeedScalar; /// Machine epsilon #define CEED_EPSILON 1e-16 - -#endif // CEED_F64_H diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index 9ebb40534d..a76b9238c3 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -1,4 +1,4 @@ -/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. /// /// SPDX-License-Identifier: BSD-2-Clause @@ -99,6 +99,16 @@ typedef struct CeedContextFieldLabel_private *CeedContextFieldLabel; /// Given an element restriction \f$E\f$, basis evaluator \f$B\f$, and quadrature function\f$f\f$, a `CeedOperator` expresses operations of the form \f$E^T B^T f(B E u)\f$ acting on the vector \f$u\f$. /// @ingroup CeedOperatorUser typedef struct CeedOperator_private *CeedOperator; +/// Generic type for all libCEED objects to support common functionality, such as viewing +/// @ingroup CeedUser +typedef struct CeedObject_private *CeedObject; + +CEED_EXTERN int CeedObjectView(CeedObject obj, FILE *stream); +CEED_EXTERN int CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs); +CEED_EXTERN int CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs); +CEED_EXTERN int CeedObjectGetCeed(CeedObject obj, Ceed *ceed); +CEED_EXTERN Ceed CeedObjectReturnCeed(CeedObject obj); +CEED_EXTERN int CeedObjectDestroy(CeedObject *obj); CEED_EXTERN int CeedRegistryGetList(size_t *n, char ***const resources, CeedInt **array); CEED_EXTERN int CeedInit(const char *resource, Ceed *ceed); @@ -107,6 +117,10 @@ CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy); CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource); CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic); CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const 
char *jit_source_root); +CEED_EXTERN int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root); +CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define); +CEED_EXTERN int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs); +CEED_EXTERN int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs); CEED_EXTERN int CeedView(Ceed ceed, FILE *stream); CEED_EXTERN int CeedDestroy(Ceed *ceed); CEED_EXTERN int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, int ecode, const char *format, ...); @@ -162,6 +176,11 @@ CEED_EXTERN int CeedErrorExit(Ceed ceed, const char *filename, int line_no, cons (CEED_VERSION_MAJOR == major && (CEED_VERSION_MINOR > minor || (CEED_VERSION_MINOR == minor && CEED_VERSION_PATCH >= patch))))) CEED_EXTERN int CeedGetVersion(int *major, int *minor, int *patch, bool *release); +CEED_EXTERN int CeedGetGitVersion(const char **git_version); +CEED_EXTERN int CeedGetBuildConfiguration(const char **build_config); + +CEED_EXTERN int CeedSetIsClang(Ceed ceed, bool isClang); +CEED_EXTERN int CeedGetIsClang(Ceed ceed, bool *isClang); CEED_EXTERN int CeedGetScalarType(CeedScalarType *scalar_type); @@ -180,8 +199,10 @@ CEED_EXTERN int CeedGetPreferredMemType(Ceed ceed, CeedMemType *type); CEED_EXTERN int CeedVectorCreate(Ceed ceed, CeedSize len, CeedVector *vec); CEED_EXTERN int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy); CEED_EXTERN int CeedVectorCopy(CeedVector vec, CeedVector vec_copy); +CEED_EXTERN int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy); CEED_EXTERN int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array); CEED_EXTERN int CeedVectorSetValue(CeedVector vec, CeedScalar value); +CEED_EXTERN int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value); CEED_EXTERN int CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type); CEED_EXTERN int 
CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array); CEED_EXTERN int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array); @@ -195,6 +216,8 @@ CEED_EXTERN int CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x); CEED_EXTERN int CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x); CEED_EXTERN int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y); CEED_EXTERN int CeedVectorReciprocal(CeedVector vec); +CEED_EXTERN int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs); +CEED_EXTERN int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs); CEED_EXTERN int CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream); CEED_EXTERN int CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream); CEED_EXTERN int CeedVectorGetCeed(CeedVector vec, Ceed *ceed); @@ -276,12 +299,16 @@ CEED_EXTERN int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, Cee CEED_EXTERN int CeedElemRestrictionGetNumPoints(CeedElemRestriction rstr, CeedInt *num_points); CEED_EXTERN int CeedElemRestrictionGetNumPointsInElement(CeedElemRestriction rstr, CeedInt elem, CeedInt *num_points); CEED_EXTERN int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points); +CEED_EXTERN int CeedElemRestrictionGetMinPointsInElement(CeedElemRestriction rstr, CeedInt *min_points); +CEED_EXTERN int CeedElemRestrictionGetMinMaxPointsInElement(CeedElemRestriction rstr, CeedInt *min_points, CeedInt *max_points); CEED_EXTERN int CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, CeedSize *l_size); CEED_EXTERN int CeedElemRestrictionGetEVectorSize(CeedElemRestriction rstr, CeedSize *e_size); CEED_EXTERN int CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, CeedInt *num_comp); CEED_EXTERN int CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, CeedInt *num_block); CEED_EXTERN int 
CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, CeedInt *block_size); CEED_EXTERN int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult); +CEED_EXTERN int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs); +CEED_EXTERN int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs); CEED_EXTERN int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream); CEED_EXTERN int CeedElemRestrictionDestroy(CeedElemRestriction *rstr); @@ -289,23 +316,28 @@ CEED_EXTERN int CeedElemRestrictionDestroy(CeedElemRestriction *rstr); // \int_\Omega v^T f_0(u, \nabla u, qdata) + (\nabla v)^T f_1(u, \nabla u, qdata) // where gradients are with respect to the reference element. -CEED_EXTERN int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode, - CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, - const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis 
*basis_project); -CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy); -CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream); -CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); -CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, - CeedVector u, CeedVector v); -CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed); +CEED_EXTERN int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode, + CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, + const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, + const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, + const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project); +CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy); +CEED_EXTERN int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs); +CEED_EXTERN int 
CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs); +CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream); +CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed); CEED_EXTERN Ceed CeedBasisReturnCeed(CeedBasis basis); CEED_EXTERN int CeedBasisGetDimension(CeedBasis basis, CeedInt *dim); CEED_EXTERN int CeedBasisGetTopology(CeedBasis basis, CeedElemTopology *topo); @@ -354,16 +386,17 @@ CEED_EXTERN int CeedQFunctionGetFields(CeedQFunction qf, CeedInt *num_input_fie CEED_EXTERN int CeedQFunctionSetContext(CeedQFunction qf, CeedQFunctionContext ctx); CEED_EXTERN int CeedQFunctionSetContextWritable(CeedQFunction qf, bool is_writable); CEED_EXTERN int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops); +CEED_EXTERN int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs); +CEED_EXTERN int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs); CEED_EXTERN int CeedQFunctionView(CeedQFunction qf, FILE *stream); CEED_EXTERN int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed); CEED_EXTERN Ceed CeedQFunctionReturnCeed(CeedQFunction qf); CEED_EXTERN int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v); CEED_EXTERN int CeedQFunctionDestroy(CeedQFunction *qf); - -CEED_EXTERN int 
CeedQFunctionFieldGetName(CeedQFunctionField qf_field, const char **field_name); -CEED_EXTERN int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size); -CEED_EXTERN int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode); -CEED_EXTERN int CeedQFunctionFieldGetData(CeedQFunctionField qf_field, const char **field_name, CeedInt *size, CeedEvalMode *eval_mode); +CEED_EXTERN int CeedQFunctionFieldGetName(CeedQFunctionField qf_field, const char **field_name); +CEED_EXTERN int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size); +CEED_EXTERN int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode); +CEED_EXTERN int CeedQFunctionFieldGetData(CeedQFunctionField qf_field, const char **field_name, CeedInt *size, CeedEvalMode *eval_mode); /** Handle for the user provided @ref CeedQFunctionContextDestroy() callback function @@ -393,13 +426,15 @@ CEED_EXTERN int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx, CEED_EXTERN int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, const char **field_name, size_t *field_offset, size_t *num_values, const char **field_description, CeedContextFieldType *field_type); CEED_EXTERN int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_size); +CEED_EXTERN int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs); +CEED_EXTERN int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs); CEED_EXTERN int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream); CEED_EXTERN int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f); CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx); CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op); CEED_EXTERN int CeedOperatorCreateAtPoints(Ceed ceed, 
CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op); -CEED_EXTERN int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op); +CEED_EXTERN int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op); CEED_EXTERN int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy); CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector vec); CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields, @@ -408,9 +443,12 @@ CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields CEED_EXTERN int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_points, CeedVector point_coords); CEED_EXTERN int CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_points, CeedVector *point_coords); CEED_EXTERN int CeedOperatorIsAtPoints(CeedOperator op, bool *is_at_points); -CEED_EXTERN int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op); -CEED_EXTERN int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators); -CEED_EXTERN int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators); +CEED_EXTERN int CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOperator sub_op); +CEED_EXTERN int CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators); +CEED_EXTERN int CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators); +CEED_EXTERN int CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op); +CEED_EXTERN int CeedOperatorCompositeSetSequential(CeedOperator op, bool is_sequential); +CEED_EXTERN int CeedOperatorCompositeIsSequential(CeedOperator op, bool *is_sequential); CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op); CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator 
op, CeedSize *input_size, CeedSize *output_size); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data); @@ -424,8 +462,9 @@ CEED_EXTERN int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, C CEED_EXTERN int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request); CEED_EXTERN int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols); CEED_EXTERN int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols); +CEED_EXTERN int CeedOperatorLinearAssembleGetNumEntries(CeedOperator op, CeedSize *num_entries); CEED_EXTERN int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values); -CEED_EXTERN int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult); +CEED_EXTERN int CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult); CEED_EXTERN int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict); @@ -437,7 +476,11 @@ CEED_EXTERN int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVe CeedOperator *op_prolong, CeedOperator *op_restrict); CEED_EXTERN int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request); CEED_EXTERN int CeedOperatorSetName(CeedOperator op, const char *name); +CEED_EXTERN int CeedOperatorGetName(CeedOperator op, const char **name); +CEED_EXTERN int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs); +CEED_EXTERN int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs); CEED_EXTERN int CeedOperatorView(CeedOperator op, FILE *stream); +CEED_EXTERN int 
CeedOperatorViewTerse(CeedOperator op, FILE *stream); CEED_EXTERN int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed); CEED_EXTERN Ceed CeedOperatorReturnCeed(CeedOperator op); CEED_EXTERN int CeedOperatorGetNumElements(CeedOperator op, CeedInt *num_elem); @@ -456,8 +499,13 @@ CEED_EXTERN int CeedOperatorGetContextBooleanRead(CeedOperator op, CeedContextF CEED_EXTERN int CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel field_label, const bool **values); CEED_EXTERN int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request); CEED_EXTERN int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request); +CEED_EXTERN int CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request); +CEED_EXTERN int CeedOperatorAssemblyDataStrip(CeedOperator op); CEED_EXTERN int CeedOperatorDestroy(CeedOperator *op); +// Compatibility with previous composite CeedOperator naming +#include "deprecated.h" + CEED_EXTERN int CeedOperatorGetFieldByName(CeedOperator op, const char *field_name, CeedOperatorField *op_field); CEED_EXTERN int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name); CEED_EXTERN int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr); diff --git a/include/ceed/cuda.h b/include/ceed/cuda.h index 839e64fed7..eb9ac3e9cb 100644 --- a/include/ceed/cuda.h +++ b/include/ceed/cuda.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/deprecated.h b/include/ceed/deprecated.h new file mode 100644 index 0000000000..233b910a60 --- /dev/null +++ b/include/ceed/deprecated.h @@ -0,0 +1,38 @@ +/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +/// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +/// +/// SPDX-License-Identifier: BSD-2-Clause +/// +/// This file is part of CEED: http://github.com/ceed + +/// @file +/// Public header for user and utility components of libCEED +#pragma once + +#if __STDC_VERSION__ >= 202311L +#define DEPRECATED(msg) [[deprecated(msg)]] +#elif defined(__GNUC__) || defined(__clang__) +#define DEPRECATED(msg) __attribute__((deprecated(msg))) +#else +#define DEPRECATED(msg) +#endif + +// Compatibility with previous composite CeedOperator naming +DEPRECATED("Use CeedOperatorCreateComposite()") +static inline int CeedCompositeOperatorCreate(Ceed a, CeedOperator *b) { return CeedOperatorCreateComposite(a, b); } +DEPRECATED("Use CeedOperatorCompositeAddSub()") +static inline int CeedCompositeOperatorAddSub(CeedOperator a, CeedOperator b) { return CeedOperatorCompositeAddSub(a, b); } +DEPRECATED("Use CeedOperatorCompositeGetNumSub()") +static inline int CeedCompositeOperatorGetNumSub(CeedOperator a, CeedInt *b) { return CeedOperatorCompositeGetNumSub(a, b); } +DEPRECATED("Use CeedOperatorCompositeGetSubList()") +static inline int CeedCompositeOperatorGetSubList(CeedOperator a, CeedOperator **b) { return CeedOperatorCompositeGetSubList(a, b); } +DEPRECATED("Use CeedOperatorCompositeGetSubByName()") +static inline int CeedCompositeOperatorGetSubByName(CeedOperator a, const char *b, CeedOperator *c) { + return CeedOperatorCompositeGetSubByName(a, b, c); +} +DEPRECATED("Use CeedOperatorCompositeGetMultiplicity()") +static inline int CeedCompositeOperatorGetMultiplicity(CeedOperator a, CeedInt b, CeedInt *c, CeedVector d) { + return 
CeedOperatorCompositeGetMultiplicity(a, b, c, d); +} + +#undef DEPRECATED diff --git a/include/ceed/fortran.h b/include/ceed/fortran.h index bb7bcac396..ed0c0ef628 100644 --- a/include/ceed/fortran.h +++ b/include/ceed/fortran.h @@ -1,4 +1,4 @@ -! Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details. ! ! SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/gen-tools.h b/include/ceed/gen-tools.h new file mode 100644 index 0000000000..f1f3743000 --- /dev/null +++ b/include/ceed/gen-tools.h @@ -0,0 +1,27 @@ +#include +#include + +class Tab { + private: + CeedInt _num_tabs{0}; + const CeedInt _width{2}; + + template + friend OStream &operator<<(OStream &os, const Tab &tab); + + public: + Tab &push() { + _num_tabs++; + return *this; + } + Tab &pop() { + if (_num_tabs > 0) _num_tabs--; + return *this; + } +}; + +template +OStream &operator<<(OStream &os, const Tab &tab) { + os << std::string(tab._num_tabs * tab._width, ' '); + return os; +} diff --git a/include/ceed/hip.h b/include/ceed/hip.h index 2c0e156872..86ba7dc098 100644 --- a/include/ceed/hip.h +++ b/include/ceed/hip.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h index da92667707..351c3be86c 100644 --- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h +++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA atomic add fallback definition - -#include +#include //------------------------------------------------------------------------------ // Atomic add, for older CUDA diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h index f3d7052e3c..5fd998d9e9 100644 --- a/include/ceed/jit-source/cuda/cuda-gen-templates.h +++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,30 +7,74 @@ /// @file /// Internal header for CUDA backend macro and type definitions for JiT source - -#include +#include //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ template -inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { +inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } +//------------------------------------------------------------------------------ +// AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> single point +//------------------------------------------------------------------------------ +template +inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt ind = indices[p + elem * NUM_PTS]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = d_u[ind + comp * COMP_STRIDE]; + } +} + +//------------------------------------------------------------------------------ +// Single point -> L-vector +//------------------------------------------------------------------------------ +template +inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { + if (p < points_in_elem) { + const CeedInt ind = indices[p + elem * NUM_PTS]; + + for (CeedInt comp = 0; comp < 
NUM_COMP; comp++) { + d_u[ind + comp * COMP_STRIDE] += r_u[comp]; + } + } +} + //------------------------------------------------------------------------------ // 1D //------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ +// Set E-vector value +//------------------------------------------------------------------------------ +template +inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { + const CeedInt target_comp = n / P_1D; + const CeedInt target_node = n % P_1D; + + if (data.t_id_x == target_node) { + r_v[target_comp] = value; + } +} + //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d) { +template +inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P_1d]; + const CeedInt ind = indices[node + elem * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } @@ -39,10 +83,10 @@ inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template 
-inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, +template +inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -53,24 +97,72 @@ inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d) { +template +inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P_1d]; + const CeedInt ind = indices[node + elem * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); } } +template +inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, + CeedScalar *__restrict__ d_v) { + const CeedInt target_comp = n / P_1D; + const CeedInt target_node = n % P_1D; + + if (data.t_id_x == target_node) { + const CeedInt ind = indices[target_node + elem * P_1D]; + + atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 
+ } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, full assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt in_comp = in / P_1D; + const CeedInt in_node = in % P_1D; + const CeedInt e_vec_size = P_1D * NUM_COMP; + + if (data.t_id_x < P_1D) { + const CeedInt out_node = data.t_id_x; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, Qfunction assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, + const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < Q_1D) { + const CeedInt ind = data.t_id_x + elem * Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { + d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, +template +inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, CeedScalar 
*__restrict__ d_v) { - if (data.t_id_x < P_1d) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -82,15 +174,29 @@ inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt e // 2D //------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ +// Set E-vector value +//------------------------------------------------------------------------------ +template +inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { + const CeedInt target_comp = n / (P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + r_v[target_comp] = value; + } +} + //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d]; +template +inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; 
comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } @@ -99,11 +205,11 @@ inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, +template +inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; @@ -113,25 +219,80 @@ inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d]; +template +inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const 
CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); } } +template +inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, + CeedScalar *__restrict__ d_v) { + const CeedInt target_comp = n / (P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; + + atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, full assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt elem_size = P_1D * P_1D; + const CeedInt in_comp = in / elem_size; + const CeedInt in_node_x = in % P_1D; + const CeedInt in_node_y = (in % elem_size) / P_1D; + const CeedInt e_vec_size = elem_size * NUM_COMP; + + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt in_node = in_node_x + in_node_y * P_1D; + const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; + + d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp]; + } + } +} + 
+//------------------------------------------------------------------------------ +// E-vector -> L-vector, Qfunction assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, + const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { + d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, +template +inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; @@ -142,52 +303,63 @@ inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt e // 3D //------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ +// Set E-vector value 
+//------------------------------------------------------------------------------ +template +inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { + const CeedInt target_comp = n / (P_1D * P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; + const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + r_v[target_node_z + target_comp * P_1D] = value; + } +} + //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -// TODO: remove "Dofs" and "Quads" in the following function names? -// - readDofsOffset3d -> readOffset3d ? -// - readDofsStrided3d -> readStrided3d ? -// - readSliceQuadsOffset3d -> readSliceOffset3d ? -// - readSliceQuadsStrided3d -> readSliceStrided3d ? -// - writeDofsOffset3d -> writeOffset3d ? -// - writeDofsStrided3d -> writeStrided3d ? 
-template -inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; - - for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp]; +template +inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; } + } } //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, +template +inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * 
STRIDES_ELEM; - for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP]; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; } + } } //------------------------------------------------------------------------------ // E-vector -> Q-vector, offests provided //------------------------------------------------------------------------------ -template -inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d; - const CeedInt ind = indices[node + elem * Q_1d * Q_1d * Q_1d]; +template +inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, + CeedScalar *__restrict__ r_u) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; + const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } @@ -196,11 +368,11 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedI //------------------------------------------------------------------------------ // E-vector -> Q-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q 
* Q_1d * Q_1d; +template +inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, + CeedScalar *__restrict__ r_u) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; @@ -210,55 +382,122 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; +template +inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; - for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); } + } +} + +template +inline __device__ void 
WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, + CeedScalar *__restrict__ d_v) { + const CeedInt target_comp = n / (P_1D * P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; + const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + + atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, full assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt elem_size = P_1D * P_1D * P_1D; + const CeedInt in_comp = in / elem_size; + const CeedInt in_node_x = in % P_1D; + const CeedInt in_node_y = (in % (P_1D * P_1D)) / P_1D; + const CeedInt in_node_z = (in % elem_size) / (P_1D * P_1D); + const CeedInt e_vec_size = elem_size * NUM_COMP; + + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D; + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; + + d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D]; + } + 
} + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, Qfunction assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, + const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + for (CeedInt z = 0; z < Q_1D; z++) { + const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { + d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D]; + } + } + } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, +template +inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d]; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; } + } } 
//------------------------------------------------------------------------------ // 3D collocated derivatives computation //------------------------------------------------------------------------------ -template -inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { +template +inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d]; + __syncthreads(); + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; __syncthreads(); // X derivative r_V[comp + 0 * NUM_COMP] = 0.0; - for (CeedInt i = 0; i < Q_1d; i++) - r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; + } // Y derivative r_V[comp + 1 * NUM_COMP] = 0.0; - for (CeedInt i = 0; i < Q_1d; i++) - r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; + } // Z derivative r_V[comp + 2 * NUM_COMP] = 0.0; - for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative) - __syncthreads(); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; + } } } } @@ -266,26 +505,29 @@ inline __device__ void 
gradCollo3d(SharedData_Cuda &data, const CeedInt q, const //------------------------------------------------------------------------------ // 3D collocated derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { +template +inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - // X derivative - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; __syncthreads(); - for (CeedInt i = 0; i < Q_1d; i++) - r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; __syncthreads(); + // X derivative + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; + } // Y derivative - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; __syncthreads(); - for (CeedInt i = 0; i < Q_1d; i++) - r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; __syncthreads(); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; + } // Z derivative - for (CeedInt i = 0; i < Q_1d; i++) - r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative) + for 
(CeedInt i = 0; i < Q_1D; i++) { + r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; + } } } } diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h index 1aedb54dbe..d9cd5a8963 100644 --- a/include/ceed/jit-source/cuda/cuda-jit.h +++ b/include/ceed/jit-source/cuda/cuda-jit.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -13,4 +13,8 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#define CEED_QFUNCTION_RUST(name) \ + extern "C" __device__ int name##_rs(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); \ + static __device__ int name(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return name##_rs(ctx, Q, in, out); } + #include "cuda-types.h" diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h index 64b57d0d68..2a4967f807 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA non-tensor product basis templates - -#include +#include //------------------------------------------------------------------------------ // Tensor contraction @@ -53,9 +52,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid // Run with P threads r_V = 0.0; for (CeedInt d = 0; d < Q_COMP; d++) { - U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U]; for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; } - d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; + d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h index 6dbf8771d8..c441e414ef 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA non-tensor product basis - -#include +#include #include "cuda-ref-basis-nontensor-templates.h" diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h new file mode 100644 index 0000000000..602a6d1f40 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h @@ -0,0 +1,408 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA tensor product basis with AtPoints evaluation +#include + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// Tensor Basis Kernels AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp +//------------------------------------------------------------------------------ +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * 
BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? 
s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; + } + post *= Q; + } + + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; + out[a * post + c] = v_k; + } + } + post *= 1; + } + } + } + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + 
for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } + + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + pre = 1; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + // Update buffers used + pre /= 1; + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? 
buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (d == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; + } + } + } + post *= Q; + } + } + + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; + } + post *= P; + } + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = 
s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; + const CeedInt u_dim_stride = 0; + const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? 
s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; + } + post *= Q; + } + + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; + + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { + // Update buffers used + pre /= Q; + const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; + out[a * post + c] = v_k; + } + } + post *= 1; + } + } + } + } + } +} + +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar 
*s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + const CeedInt v_dim_stride = 0; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } + + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + + pre = 1; + post = 1; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { + // Update buffers used + pre /= 1; + const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? 
buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (dim_2 == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; + } + } + } + post *= Q; + } + } + } + + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; + } + post *= P; + } + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h index 7361c994e0..baa8554eda 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA tensor product basis - -#include +#include //------------------------------------------------------------------------------ // Tensor Basis Kernels @@ -17,7 +16,7 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; @@ -29,44 +28,42 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos s_interp_1d[k] = interp_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; - const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_size = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt P = is_transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = is_transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride_0 = is_transpose ? 1 : BASIS_P_1D; + const CeedInt stride_1 = is_transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = is_transpose ? 
BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); + const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); + const CeedInt u_size = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; - for (CeedInt k = i; k < u_size; k += blockDim.x) { - s_buffer_1[k] = cur_u[k]; - } for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); // Update buffers used pre /= P; - const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? 
s_buffer_1 : s_buffer_2); const CeedInt writeLen = pre * post * Q; // Contract along middle index for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar vk = 0; - - for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; } @@ -77,7 +74,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d, const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; @@ -91,26 +88,26 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, s_grad_1d[k] = grad_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; - const CeedInt u_comp_stride = num_elem * (transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_dim_stride = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; - const CeedInt v_dim_stride = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; + const CeedInt P = is_transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = is_transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride_0 = is_transpose ? 1 : BASIS_P_1D; + const CeedInt stride_1 = is_transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); + const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); + const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; + const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { // dim*dim contractions for grad for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedInt pre = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + CeedInt pre = is_transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES; CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); @@ -129,7 +126,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, CeedScalar v_k = 0; for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; + if (is_transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; else out[k] = v_k; } post *= Q; diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h index df5b9ad338..5be93d9a1e 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,7 @@ /// @file /// Internal header for CUDA operator diagonal assembly -#include +#include #if USE_CEEDSIZE typedef CeedSize IndexType; diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h index 6333f771f2..76643040fb 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA operator full assembly - -#include +#include #if USE_CEEDSIZE typedef CeedSize IndexType; @@ -24,7 +23,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { extern __shared__ CeedScalar s_CT[]; - CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN]; const int l = threadIdx.x; // The output column index of each B^T D B operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il @@ -62,7 +61,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l]; } } // end of out eval mode - } // end of in eval mode + } // end of in eval mode if (orients_in) { result *= orients_in[NUM_NODES_IN * e + l] ? 
-1.0 : 1.0; } @@ -101,6 +100,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ } } } // end of out component - } // end of in component - } // end of element loop + } // end of in component + } // end of element loop } diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h index 7fbf7901bc..61785cc00c 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h +++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA backend QFunction read/write kernels - -#include +#include //------------------------------------------------------------------------------ // Read from quadrature points diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h new file mode 100644 index 0000000000..73ecc3bb25 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h @@ -0,0 +1,56 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA offset element restriction kernels +#include + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, standard (with offsets) +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + if (loc_node >= points_per_elem[elem]) continue; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } +} +#else +extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ points_per_elem, const CeedInt *__restrict__ t_offsets, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + if (loc_node >= 
points_per_elem[elem]) continue; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h index d317f42cc5..e83eebb8cd 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA curl-oriented element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, curl-oriented @@ -80,7 +79,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } @@ -138,7 +137,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h index 0bd3dc0dd8..487c4d2194 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA offset element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, standard (with offsets) @@ -36,7 +35,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h index d36f27277e..ead457562a 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA oriented element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, oriented @@ -40,7 +39,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h index d10f73c11d..c5dc12b227 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA strided element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, strided diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h new file mode 100644 index 0000000000..d49bc52a4b --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h @@ -0,0 +1,98 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA shared memory non-tensor basis templates +#include + +//------------------------------------------------------------------------------ +// 1D tensor contraction +//------------------------------------------------------------------------------ +template +inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + data.slice[data.t_id_x] = *U; + __syncthreads(); + *V = 0.0; + if (data.t_id_x < Q_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction + } + } + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 1D transpose tensor contraction +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + data.slice[data.t_id_x] = *U; + __syncthreads(); + if (data.t_id_x < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[data.t_id_x + i * 
P_1D] * data.slice[i]; // Contract x direction + } + } + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// Interpolate to quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + Contract1d(data, &r_U[comp], c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// Interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = 0.0; + ContractTranspose1d(data, &r_U[comp], c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// Derivatives at quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt dim = 0; dim < DIM; dim++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + Contract1d(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]); + } + } +} + +//------------------------------------------------------------------------------ +// Derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; 
comp < NUM_COMP; comp++) r_V[comp] = 0.0; + for (CeedInt dim = 0; dim < DIM; dim++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTranspose1d(data, &r_U[comp + dim * NUM_COMP], &c_G[dim * P * Q], &r_V[comp]); + } + } +} + +//------------------------------------------------------------------------------ +// Quadrature weights +//------------------------------------------------------------------------------ +template +inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) { + *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0; +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h new file mode 100644 index 0000000000..abddaa58cd --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h @@ -0,0 +1,200 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA shared memory non-tensor basis +#include + +#include "cuda-shared-basis-nontensor-templates.h" +#include "cuda-shared-basis-read-write-templates.h" + +//------------------------------------------------------------------------------ +// Interp kernels +//------------------------------------------------------------------------------ +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load interp into shared memory + __shared__ CeedScalar s_B[BASIS_P * BASIS_Q]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U); + InterpNonTensor(data, r_U, s_B, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V); + } +} + +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar 
r_V[BASIS_NUM_COMP]; + + // load interp into shared memory + __shared__ CeedScalar s_B[BASIS_P * BASIS_Q]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + InterpTransposeNonTensor(data, r_U, s_B, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load interp into shared memory + __shared__ CeedScalar s_B[BASIS_P * BASIS_Q]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + InterpTransposeNonTensor(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +//------------------------------------------------------------------------------ +// Grad kernels +//------------------------------------------------------------------------------ +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = 
threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; + + // load grad into shared memory + __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U); + GradNonTensor(data, r_U, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V); + } +} + +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load grad into shared memory + __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + GradTransposeNonTensor(data, r_U, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ 
CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load grad into shared memory + __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + GradTransposeNonTensor(data, r_U, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +//------------------------------------------------------------------------------ +// Weight kernel +//------------------------------------------------------------------------------ +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_W[1]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + WeightNonTensor(data, q_weight, r_W); + WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W); + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index b10ba108f8..ececd93ae6 100644 --- 
a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,15 @@ /// @file /// Internal header for CUDA shared memory basis read/write templates +#include -#include +//------------------------------------------------------------------------------ +// Load matrices for basis actions +//------------------------------------------------------------------------------ +template +inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { + for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; +} //------------------------------------------------------------------------------ // 1D @@ -46,6 +53,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -82,6 +102,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void 
SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -121,3 +154,58 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn } } } + +template +inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} + +//------------------------------------------------------------------------------ +// AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// E-vector -> single point +//------------------------------------------------------------------------------ +template +inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, + const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const 
CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; + + if (p < points_in_elem) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = d_u[ind + comp * strides_comp]; + } + } else { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = 0.0; + } + } +} + +//------------------------------------------------------------------------------ +// Single point -> E-vector +//------------------------------------------------------------------------------ +template +inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, + CeedScalar *d_v) { + if (p < points_in_elem) { + const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] = r_v[comp]; + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h new file mode 100644 index 0000000000..6f2843acce --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -0,0 +1,467 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA shared memory tensor product basis AtPoints templates +#include + +//------------------------------------------------------------------------------ +// Chebyshev values: chebyshev_x[0] = 1, chebyshev_x[1] = 2x, then chebyshev_x[i] = 2x * chebyshev_x[i-1] - chebyshev_x[i-2] for i < Q_1D +//------------------------------------------------------------------------------ +template +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + if (Q_1D > 1) chebyshev_x[1] = 2 * x;  // callers size the array as [Q_1D]; entry 1 exists only when Q_1D > 1 + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3];  // rolling window holding the last polynomial values needed by the derivative recurrence + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + if (Q_1D > 1) chebyshev_dx[1] = 2.0;  // output array has Q_1D entries; entry 1 exists only when Q_1D > 1 + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 1D interpolate to points: for each component, threads with t_id_x < Q_1D stage r_C[comp] into data.slice, then every thread contracts data.slice with chebyshev_x into r_V[comp] +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads();  // Load coefficients; barrier first so no thread is still reading data.slice from the previous component + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // 
Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose: each thread with p < NUM_POINTS adds chebyshev_x[i] * r_U[comp] into data.slice[i]; thread t_id_x < Q_1D then adds data.slice[t_id_x] to r_C[comp] +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction into data.slice[0, Q_1D), the same entries cleared above and read back below (index rotated by t_id_x) + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd_block(&data.slice[(i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D 
derivatives transpose: each thread with p < NUM_POINTS adds (derivative value i) * r_U[comp] into data.slice[i]; thread t_id_x < Q_1D then adds data.slice[t_id_x] to r_C[comp] +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction into data.slice[0, Q_1D), the same entries cleared above and read back below (index rotated by t_id_x) + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd_block(&data.slice[(i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 2D interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + 
} + } + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar 
*__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; + } + } + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar r_u = p < NUM_POINTS ? 
r_U[comp + dim * NUM_COMP] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 3D interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction value + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; 
j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i] * z; + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction value + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar r_u = p < NUM_POINTS ? 
r_U[comp] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u * z; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction values + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + const CeedScalar dz = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + // Gradient directions + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; 
j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar zz = dim == 2 ? dz : z; + + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz; + } + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction values + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + const CeedScalar dz = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Gradient directions + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract y and z direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar zz = dim == 2 ? dz : z; + const CeedScalar r_u = (p < NUM_POINTS) ? 
r_U[comp + dim * NUM_COMP] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u * zz; + } + + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h new file mode 100644 index 0000000000..fc812792e4 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -0,0 +1,394 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA tensor product basis with AtPoints evaluation +#include + +#include "cuda-shared-basis-read-write-templates.h" +#include "cuda-shared-basis-tensor-at-points-templates.h" +#include "cuda-shared-basis-tensor-templates.h" + +//------------------------------------------------------------------------------ +// Tensor Basis Kernels AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp +//------------------------------------------------------------------------------ +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, s_B, r_C); + } + + // Map to points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + InterpAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + InterpAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + InterpAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + 
data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Clear output vector + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; + if (BASIS_DIM == 1) { + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) 
{ + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + 
threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, s_B, r_C); + } + + // Map to points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + GradAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + GradAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + GradAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } + } +} + +extern "C" __global__ void GradTransposeAtPoints(const CeedInt 
num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Clear output vector + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1); i++) r_V[i] = 0.0; + if (BASIS_DIM == 1) { + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, + r_U); + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 
+ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, + r_U); + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + 
InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h new file mode 100644 index 0000000000..54594b3af4 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h @@ -0,0 +1,680 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA shared memory tensor product basis templates +#include + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 2D tensor contraction x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, + CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D 
tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, + CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contract x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < P_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contract and add x 
+//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + if (t_id_x < P_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D pack/unpack quadrature values +//------------------------------------------------------------------------------ +template +inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { + const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; + __syncthreads(); + U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; + } +} + +template +inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { + const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; + __syncthreads(); + U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? 
data.slice[t_id_x + t_id_y * T_1D] : 0.0; + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + CeedScalar r_t[1]; + + if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); + ContractY2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U); + if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + CeedScalar r_t[1]; + + if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); + ContractTransposeX2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void 
InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + + if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U); + if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + + if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D derivatives at quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + CeedScalar r_t[1]; + + if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); + ContractY2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2dFlattened(data, t_id_x, 
t_id_y, &r_U[comp], c_B, r_t); + ContractY2dFlattened(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U); + if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + CeedScalar r_t[1]; + + if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2dFlattened(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); + ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D derivatives at quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + + if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); + 
ContractY2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U); + if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + + if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); + ContractTransposeAddX2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D quadrature weights +//------------------------------------------------------------------------------ +template +inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { + const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; + + *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 3D tensor contract x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract z +//------------------------------------------------------------------------------ +template +inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar 
*U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract z +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract z +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, + const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract y 
+//------------------------------------------------------------------------------
+// Every thread stores *U into data.slice at (t_id_x, t_id_y, t_id_z) between two __syncthreads() barriers.
+// Threads with t_id_x < Q_1D, t_id_y < P_1D and t_id_z < P_1D then compute
+//   *V = sum_{i < Q_1D} B[t_id_y + i * P_1D] * slice(t_id_x, i, t_id_z);
+// every other thread leaves *V at 0.0.
+template
+inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract add y
+//------------------------------------------------------------------------------
+// Same contraction as ContractTransposeY3dFlattened, but the sum is accumulated onto the existing *V
+// (no reset to 0.0); threads outside the guarded index range leave *V unchanged.
+template
+inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract x
+//------------------------------------------------------------------------------
+// Every thread stores *U into data.slice at (t_id_x, t_id_y, t_id_z) between two __syncthreads() barriers.
+// Threads with t_id_x, t_id_y and t_id_z all < P_1D then compute
+//   *V = sum_{i < Q_1D} B[t_id_x + i * P_1D] * slice(i, t_id_y, t_id_z);
+// every other thread leaves *V at 0.0.
+template
+inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract add x
+//------------------------------------------------------------------------------
+// Same contraction as ContractTransposeX3dFlattened, but the sum is accumulated onto the existing *V
+// (no reset to 0.0); threads outside the guarded index range leave *V unchanged.
+template
+inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+// QPack3d: for each of the NUM_COMP entries of U, threads with t_id_x < Q_1D and t_id_y < Q_1D store U[comp] into
+// data.slice at their (t_id_x, t_id_y, t_id_z) position; each thread then reloads U[comp] from the position obtained
+// by splitting the linear index data.t_id_x with base Q_1D, or sets it to 0.0 when data.t_id_x >= Q_1D^3.
+// NOTE(review): the store guard tests only t_id_x and t_id_y, not t_id_z - confirm this is intended.
+template
+inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  // Decompose the linear thread index with base Q_1D instead of the base T_1D used by the callers
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
+  }
+}
+
+// QUnpack3d: reverse mapping of QPack3d. Threads with data.t_id_x < Q_1D^3 store U[comp] at the base-Q_1D position;
+// each thread then reloads U[comp] from its (t_id_x, t_id_y, t_id_z) position when t_id_x < Q_1D and t_id_y < Q_1D,
+// and sets it to 0.0 otherwise.
+// NOTE(review): the reload guard does not test t_id_z, so threads with t_id_z >= Q_1D read a slot this call did not
+// write - confirm later stages never consume those values.
+template
+inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points
+//------------------------------------------------------------------------------
+// Splits data.t_id_x into (t_id_x, t_id_y, t_id_z) with base T_1D, unpacks r_U when P_1D != T_1D, and for each
+// component applies the x, y and z contractions with c_B, writing the result to r_V[comp].
+// Afterwards r_U is packed again when P_1D != T_1D and r_V is packed when Q_1D != T_1D.
+template
+inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+// Unpacks r_U when Q_1D != T_1D, then for each component applies the transposed z, y and x contractions with c_B
+// (the reverse order of InterpTensor3dFlattened) and writes r_V[comp]. r_V is packed when P_1D != T_1D.
+// Unlike the forward routine, r_U is not packed again before returning.
+template
+inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+// No contraction is performed: after the optional unpack of r_U, each r_U[comp] is copied to r_V[comp].
+// c_B is accepted but not read. r_U and r_V are then packed under the same conditions as InterpTensor3dFlattened.
+template
+inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+// Copy-only transpose: r_U is unpacked when Q_1D != T_1D, each r_U[comp] is copied to r_V[comp], and r_V is packed
+// when P_1D != T_1D. c_B is accepted but not read.
+template
+inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+// For each component, runs three x-y-z contraction chains, each using c_G in exactly one direction and c_B in the
+// other two. The chain with c_G in x writes r_V[comp + 0 * NUM_COMP], in y writes r_V[comp + 1 * NUM_COMP], and
+// in z writes r_V[comp + 2 * NUM_COMP]. Unpack/pack handling matches InterpTensor3dFlattened.
+template
+inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+// Transpose of GradTensor3dFlattened. r_U holds three derivative slots per component
+// (r_U[comp + d * NUM_COMP], d = 0, 1, 2). Each slot is pushed through the transposed z, y, x contractions,
+// with c_G used in direction d and c_B in the other two; the three results are summed into r_V[comp]
+// (the first chain overwrites, the second and third accumulate through the Add variant).
+// r_U is unpacked first when Q_1D != T_1D and r_V is packed at the end when P_1D != T_1D.
+template
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // The contraction helpers take the decomposed thread indices explicitly
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, collocated
+//------------------------------------------------------------------------------
+// For each component, first interpolates with c_B in x, y and z (result left in r_t1), then applies c_G to that
+// interpolated value once per direction, writing r_V[comp + 0 * NUM_COMP] (x), r_V[comp + 1 * NUM_COMP] (y) and
+// r_V[comp + 2 * NUM_COMP] (z). Unpack/pack handling matches the other forward routines in this file.
+template
+inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, collocated
+//------------------------------------------------------------------------------
+// Transpose of GradTensorCollocated3dFlattened. The three derivative slots of r_U are first combined through the
+// transposed c_G contractions (z overwrites r_t2, y and x accumulate into it), and the combined value is then
+// passed through the transposed z, y, x contractions with c_B into r_V[comp].
+// This function previously carried the name GradTransposeTensor3dFlattened, duplicating the definition above.
+// NOTE(review): the three c_G contractions presumably act on the collocated (quadrature-to-quadrature) gradient,
+// mirroring the c_G calls in GradTensorCollocated3dFlattened - confirm the contraction sizes used for them.
+template
+inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Sum of the transposed derivative operators applied to the three slots
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddY3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
+    // Transposed interpolation of the combined value
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+// No interpolation step: for each component, c_G is applied directly to r_U[comp] once per direction, writing
+// r_V[comp + 0 * NUM_COMP] (x), r_V[comp + 1 * NUM_COMP] (y) and r_V[comp + 2 * NUM_COMP] (z). c_B is not read.
+// r_U is unpacked first when P_1D != T_1D; afterwards r_U is packed again under the same condition and r_V is
+// packed when Q_1D != T_1D.
+template
+inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // The contraction helpers take a pointer to the input value, so pass the address of the component
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+// Transpose of GradTensorCollocatedNodes3dFlattened: the transposed c_G contraction of the z slot overwrites
+// r_V[comp], and the y and x slots are accumulated onto it through the Add variants. c_B is not read.
+// r_U is unpacked first when Q_1D != T_1D and r_V is packed at the end when P_1D != T_1D.
+template
+inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddY3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D quadrature weights
+//------------------------------------------------------------------------------
+// Splits data.t_id_x into (t_id_x, t_id_y, t_id_z) with base Q_1D and writes the product of the three 1D weights
+// to *w, or 0.0 when an index falls outside [0, Q_1D). Because t_id_x and t_id_y come from a modulo by Q_1D,
+// the t_id_z test is the one that rejects threads beyond Q_1D^3 (assuming data.t_id_x is non-negative).
+template
+inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index f2fde94139..dc05f100ae 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA shared memory tensor product basis templates - -#include +#include //------------------------------------------------------------------------------ // 1D @@ -19,6 +18,7 @@ //------------------------------------------------------------------------------ template inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); *V = 0.0; @@ -27,7 +27,6 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ @@ -35,6 +34,7 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); *V = 0.0; @@ -43,56 +43,77 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 1D interpolate to quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_B, r_V + comp); + ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } } 
//------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; } } //------------------------------------------------------------------------------ // 1D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - 
ContractX1d(data, r_U + comp, c_G, r_V + comp); + ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } } //------------------------------------------------------------------------------ // 1D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } } //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -template +template inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { *w = (data.t_id_x < Q_1D) ? 
q_weight_1d[data.t_id_x] : 0.0; } @@ -104,8 +125,9 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -114,14 +136,14 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -130,14 +152,14 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -146,14 +168,14 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca *V += B[data.t_id_y + i * P_1D] * 
data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -162,14 +184,14 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { @@ -177,69 +199,114 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_B, r_V + comp); + 
ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp]); } } //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp, c_B, r_t); - ContractTransposeX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp], c_B, r_t); + ContractTransposeX2d(data, r_t, c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; } } //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ 
void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); + ContractX2d(data, &r_U[comp], c_G, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); } } //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); - ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); - ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2d(data, r_t, c_G, &r_V[comp]); + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2d(data, r_t, c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, 
CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX2d(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); + ContractY2d(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); + ContractTransposeAddX2d(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); } } //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template +template inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? 
q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } @@ -251,7 +318,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { @@ -259,6 +326,7 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -267,14 +335,13 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { @@ -282,6 +349,7 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -290,14 +358,13 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D tensor contract z 
//------------------------------------------------------------------------------ -template +template inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { for (CeedInt k = 0; k < Q_1D; k++) { V[k] = 0.0; @@ -312,7 +379,7 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { for (CeedInt k = 0; k < P_1D; k++) { V[k] = 0.0; @@ -327,7 +394,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -335,6 +402,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -343,14 +411,13 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, 
const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -358,6 +425,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { @@ -365,14 +433,13 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -380,6 +447,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -388,14 +456,13 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -403,6 +470,7 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed } for (CeedInt k = 0; k < P_1D; k++) { 
+ __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { @@ -410,122 +478,173 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D]); } } //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, &r_U[comp * Q_1D], c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + } +} + +//------------------------------------------------------------------------------ +// 3D 
interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < Q_1D; i++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[i + comp * Q_1D] = r_U[i + comp * P_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < Q_1D; i++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[i + comp * P_1D] = r_U[i + comp * Q_1D]; + } } } //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp 
* Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, &r_U[comp * P_1D], c_G, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_G, r_t2); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeX3d(data, r_t2, c_G, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_G, r_t2); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, 
&r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_B, r_t1); + ContractX3d(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * 
NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeZ3d(data, r_t2, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeZ3d(data, r_t2, c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX3d(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * 
NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); } } //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template +template inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index d6039d3a33..ae1cdfc5c7 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,14 +7,13 @@ /// @file /// Internal header for CUDA shared memory tensor product basis - -#include +#include #include "cuda-shared-basis-read-write-templates.h" #include "cuda-shared-basis-tensor-templates.h" //------------------------------------------------------------------------------ -// Interp kernel by dim +// Interp kernels by dim //------------------------------------------------------------------------------ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; @@ -24,30 +23,66 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); - Interp1d(data, r_U, c_B, r_V); + Interp1d(data, r_U, s_B, r_V); WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); - InterpTensor2d(data, r_U, c_B, r_V); + InterpTensor2d(data, r_U, s_B, r_V); WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); - InterpTensor3d(data, r_U, c_B, r_V); + InterpTensor3d(data, r_U, s_B, r_V); WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } } +extern "C" __global__ void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_P_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V); + } + } +} + extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; @@ -57,30 +92,135 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_P_1D : 1)]; + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); - InterpTranspose1d(data, r_U, c_B, r_V); + InterpTranspose1d(data, r_U, s_B, r_V); WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - InterpTransposeTensor2d(data, r_U, c_B, r_V); + InterpTransposeTensor2d(data, r_U, s_B, r_V); WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - InterpTransposeTensor3d(data, r_U, c_B, r_V); + InterpTransposeTensor3d(data, r_U, s_B, r_V); WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } } +extern "C" __global__ void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } + } +} + +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_P_1D : 1)]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __global__ void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -93,26 +233,74 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); - Grad1d(data, r_U, c_B, c_G, r_V); + Grad1d(data, r_U, s_B, s_G, r_V); WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); - GradTensor2d(data, r_U, c_B, c_G, r_V); + GradTensor2d(data, r_U, s_B, s_G, r_V); WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); - if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, c_B, c_G, r_V); - else GradTensor3d(data, r_U, c_B, c_G, r_V); + if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTensor3d(data, r_U, s_B, s_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); + } + } +} + +extern "C" __global__ void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_P_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load grad_1d into shared memory + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Grad1d(data, r_U, NULL, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + GradTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, + d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + GradTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } @@ -128,32 +316,163 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_P_1D : 1)]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __global__ void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? 
BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + // load grad_1d into shared memory + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); - GradTranspose1d(data, r_U, c_B, c_G, r_V); + GradTranspose1d(data, r_U, NULL, s_G, r_V); WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - GradTransposeTensor2d(data, r_U, c_B, c_G, r_V); + GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V); - else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V); + GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } } +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + 
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __global__ void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar 
slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + // load grad_1d into shared memory + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, NULL, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ @@ -165,19 +484,20 @@ extern "C" __global__ void Weight(const CeedInt num_elem, 
const CeedScalar *__re data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1]; + // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - Weight1d(data, q_weight_1d, r_W); + Weight1d(data, q_weight_1d, r_W); WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 2) { - WeightTensor2d(data, q_weight_1d, r_W); + WeightTensor2d(data, q_weight_1d, r_W); WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 3) { - WeightTensor3d(data, q_weight_1d, r_W); + WeightTensor3d(data, q_weight_1d, r_W); WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); } diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h index 9863caa7e0..58b2961246 100644 --- a/include/ceed/jit-source/cuda/cuda-types.h +++ b/include/ceed/jit-source/cuda/cuda-types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for CUDA type definitions -#ifndef CEED_CUDA_TYPES_H -#define CEED_CUDA_TYPES_H +#pragma once #include @@ -24,6 +23,13 @@ typedef struct { CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS]; } FieldsInt_Cuda; +typedef struct { + CeedInt num_elem; + const CeedInt *num_per_elem; + const CeedInt *indices; + const CeedScalar *coords; +} Points_Cuda; + typedef struct { CeedInt t_id_x; CeedInt t_id_y; @@ -31,5 +37,3 @@ typedef struct { CeedInt t_id; CeedScalar *slice; } SharedData_Cuda; - -#endif // CEED_CUDA_TYPES_H diff --git a/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h b/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h new file mode 100644 index 0000000000..5cf406fe51 --- /dev/null +++ b/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h @@ -0,0 +1,22 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/** + @brief Identity QFunction that copies first input component directly into output +**/ +#include + +CEED_QFUNCTION(IdentityScalar)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // in[0] is input, size (Q*size) + const CeedScalar *input = in[0]; + // out[0] is output, size (Q) + CeedScalar *output = out[0]; + + // Quadrature point loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { output[i] = input[i]; } // End of Quadrature Point Loop + return CEED_ERROR_SUCCESS; +} diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h index 1a84718f4a..110083b372 100644 --- a/include/ceed/jit-source/gallery/ceed-identity.h +++ b/include/ceed/jit-source/gallery/ceed-identity.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Identity QFunction that copies inputs directly into outputs **/ - -#include +#include typedef struct { CeedInt size; @@ -27,6 +26,5 @@ CEED_QFUNCTION(Identity)(void *ctx, const CeedInt Q, const CeedScalar *const *in // Quadrature point loop CeedPragmaSIMD for (CeedInt i = 0; i < Q * size; i++) { output[i] = input[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h index c266beff64..d9a985a56d 100644 --- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h +++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for building the geometric data for the 1D mass matrix **/ - -#include +#include CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians, size (Q) @@ -20,6 +19,5 @@ CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const // Quadrature point loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h index 7e5f6fbd34..4a6946ebce 100644 --- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h +++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for building the geometric data for the 2D mass matrix **/ - -#include +#include CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians with shape [2, nc=2, Q] @@ -22,6 +21,5 @@ CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h index 71dc961215..1d7f094dba 100644 --- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h +++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for building the geometric data for the 3D mass matrix **/ - -#include +#include CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians with shape [2, nc=3, Q] @@ -24,6 +23,5 @@ CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) * w[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h index 8559ce8a26..41a0695e39 100644 --- a/include/ceed/jit-source/gallery/ceed-massapply.h +++ b/include/ceed/jit-source/gallery/ceed-massapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the mass matrix **/ - -#include +#include CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is u, size (Q) @@ -20,6 +19,5 @@ CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *i // Quadrature point loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = u[i] * q_data[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h index dc38d4f21a..d23f134eb0 100644 --- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h +++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the 1D Poisson operator **/ - -#include +#include CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, size (Q) @@ -21,6 +20,5 @@ CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *con // Quadrature point loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h index dce08aabb2..b84fa01d31 100644 --- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h +++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for building the geometric data for the 1D Poisson operator **/ - -#include +#include CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store @@ -24,6 +23,5 @@ CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con // Quadrature point loop CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h index dab64be671..62329ad1a4 100644 --- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h +++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the 2D Poisson operator **/ - -#include +#include CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, shape [2, nc=1, Q] @@ -35,6 +34,5 @@ CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *con // j = direction of vg for (CeedInt j = 0; j < dim; j++) vg[j][i] = (ug[0][i] * dXdxdXdxT[0][j] + ug[1][i] * dXdxdXdxT[1][j]); } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h index 11e15255ad..8546c304cd 100644 --- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h +++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for building the geometric data for the 2D Poisson operator **/ - -#include +#include CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store @@ -30,10 +29,10 @@ CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con const CeedScalar J01 = J[1][0][i]; const CeedScalar J11 = J[1][1][i]; const CeedScalar qw = w[i] / (J00 * J11 - J10 * J01); - q_data[0][i] = qw * (J01 * J01 + J11 * J11); - q_data[1][i] = qw * (J00 * J00 + J10 * J10); - q_data[2][i] = -qw * (J00 * J01 + J10 * J11); - } // End of Quadrature Point Loop + q_data[0][i] = qw * (J01 * J01 + J11 * J11); + q_data[1][i] = qw * (J00 * J00 + J10 * J10); + q_data[2][i] = -qw * (J00 * J01 + J10 * J11); + } // End of Quadrature Point Loop return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h index 71e76926e7..77295c9fb8 100644 --- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h +++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the geometric data for the 3D Poisson operator **/ - -#include +#include CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, shape [3, nc=1, Q] @@ -37,6 +36,5 @@ CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *con // j = direction of vg for (CeedInt j = 0; j < dim; j++) vg[j][i] = (ug[0][i] * dXdxdXdxT[0][j] + ug[1][i] * dXdxdXdxT[1][j] + ug[2][i] * dXdxdXdxT[2][j]); } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h index 2d4e0621e4..b42bbb93f9 100644 --- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h +++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for building the geometric data for the 3D Poisson operator **/ - -#include +#include CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store the symmetric part of the result. 
@@ -47,6 +46,5 @@ CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *con q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-scale-scalar.h b/include/ceed/jit-source/gallery/ceed-scale-scalar.h new file mode 100644 index 0000000000..f70c62ec9a --- /dev/null +++ b/include/ceed/jit-source/gallery/ceed-scale-scalar.h @@ -0,0 +1,29 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/** + @brief Scaling QFunction that scales inputs +**/ +#include + +CEED_QFUNCTION(ScaleScalar)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // Ctx holds field size + const CeedInt size = *(CeedInt *)ctx; + + // in[0] is input, size (Q*size) + // in[1] is scaling factor, size (Q) + const CeedScalar *input = in[0]; + const CeedScalar *scale = in[1]; + // out[0] is output, size (Q*size) + CeedScalar *output = out[0]; + + // Quadrature point loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + for (CeedInt j = 0; j < size; j++) output[i + j * Q] = input[i + j * Q] * scale[i]; + } // End of Quadrature Point Loop + return CEED_ERROR_SUCCESS; +} diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h index 1249810987..6c0157f7e2 100644 --- a/include/ceed/jit-source/gallery/ceed-scale.h +++ b/include/ceed/jit-source/gallery/ceed-scale.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Scaling QFunction that scales inputs **/ - -#include +#include CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Ctx holds field size @@ -24,5 +23,5 @@ CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, C // Quadrature point loop CeedPragmaSIMD for (CeedInt i = 0; i < Q * size; i++) { output[i] = input[i] * scale[i]; } // End of Quadrature Point Loop - return 0; + return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h index 70a2f3e25c..adc67918f6 100644 --- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h +++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the mass matrix on a vector system with three components **/ - -#include +#include CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is u, size (Q) @@ -26,6 +25,5 @@ CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *c v[c][i] = u[c][i] * q_data[i]; } } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h index e056729422..8921c348ae 100644 --- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h +++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the 1D Poisson operator on a vector system with three components **/ - -#include +#include CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, shape [1, nc=3, Q] @@ -26,6 +25,5 @@ CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScal vg[c][i] = ug[c][i] * q_data[i]; } } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h index 1b56240048..12f7d73468 100644 --- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h +++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the 2D Poisson operator on a vector system with three components **/ - -#include +#include CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, shape [2, nc=3, Q] @@ -36,6 +35,5 @@ CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScal for (CeedInt j = 0; j < dim; j++) for (CeedInt c = 0; c < num_comp; c++) vg[j][c][i] = (ug[0][c][i] * dXdxdXdxT[0][j] + ug[1][c][i] * dXdxdXdxT[1][j]); } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h index 9ca86dba01..634ecb01a5 100644 --- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h +++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -8,8 +8,7 @@ /** @brief Ceed QFunction for applying the geometric data for the 3D Poisson on a vector system with three components operator **/ - -#include +#include CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, shape [3, nc=3, Q] @@ -39,6 +38,5 @@ CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScal for (CeedInt c = 0; c < num_comp; c++) vg[j][c][i] = (ug[0][c][i] * dXdxdXdxT[0][j] + ug[1][c][i] * dXdxdXdxT[1][j] + ug[2][c][i] * dXdxdXdxT[2][j]); } // End of Quadrature Point Loop - return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h index 812e901866..0064ec66e3 100644 --- a/include/ceed/jit-source/hip/hip-gen-templates.h +++ b/include/ceed/jit-source/hip/hip-gen-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,30 +7,74 @@ /// @file /// Internal header for HIP backend macro and type definitions for JiT source - -#include +#include //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ template -inline __device__ void loadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { +inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } +//------------------------------------------------------------------------------ +// AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> single point +//------------------------------------------------------------------------------ +template +inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt ind = indices[p + elem * NUM_PTS]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = d_u[ind + comp * COMP_STRIDE]; + } +} + +//------------------------------------------------------------------------------ +// Single point -> L-vector +//------------------------------------------------------------------------------ +template +inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { + if (p < points_in_elem) { + const CeedInt ind = indices[p + elem * NUM_PTS]; + + for (CeedInt comp = 0; comp < NUM_COMP; 
comp++) { + d_u[ind + comp * COMP_STRIDE] += r_u[comp]; + } + } +} + //------------------------------------------------------------------------------ // 1D //------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ +// Set E-vector value +//------------------------------------------------------------------------------ +template +inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { + const CeedInt target_comp = n / P_1D; + const CeedInt target_node = n % P_1D; + + if (data.t_id_x == target_node) { + r_v[target_comp] = value; + } +} + //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d) { +template +inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P_1d]; + const CeedInt ind = indices[node + elem * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } @@ -39,9 +83,9 @@ inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_ //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ 
void readDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d) { +template +inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -52,24 +96,72 @@ inline __device__ void readDofsStrided1d(SharedData_Hip &data, const CeedInt ele //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d) { +template +inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P_1d]; + const CeedInt ind = indices[node + elem * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); } } +template +inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, + CeedScalar *__restrict__ d_v) { + const CeedInt target_comp = n / P_1D; + const CeedInt target_node = n % P_1D; + + if (data.t_id_x == target_node) { + const CeedInt ind = indices[target_node + elem * P_1D]; + + atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], 
r_v[target_comp]); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, full assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt in_comp = in / P_1D; + const CeedInt in_node = in % P_1D; + const CeedInt e_vec_size = P_1D * NUM_COMP; + + if (data.t_id_x < P_1D) { + const CeedInt out_node = data.t_id_x; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, Qfunction assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, + const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < Q_1D) { + const CeedInt ind = data.t_id_x + elem * Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { + d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, +template +inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ 
r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d) { + if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -81,15 +173,29 @@ inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt el // 2D //------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ +// Set E-vector value +//------------------------------------------------------------------------------ +template +inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { + const CeedInt target_comp = n / (P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + r_v[target_comp] = value; + } +} + //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d]; +template +inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; for (CeedInt comp = 0; comp < 
NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } @@ -98,10 +204,10 @@ inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_ //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; +template +inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; @@ -111,25 +217,80 @@ inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt ele //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d]; +template +inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < 
P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); } } +template +inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, + CeedScalar *__restrict__ d_v) { + const CeedInt target_comp = n / (P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; + + atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, full assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt elem_size = P_1D * P_1D; + const CeedInt in_comp = in / elem_size; + const CeedInt in_node_x = in % P_1D; + const CeedInt in_node_y = (in % elem_size) / P_1D; + const CeedInt e_vec_size = elem_size * NUM_COMP; + + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt in_node = in_node_x + in_node_y * P_1D; + const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; + + d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp]; + } + } +} + 
+//------------------------------------------------------------------------------ +// E-vector -> L-vector, Qfunction assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, + const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { + d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, +template +inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d; + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; @@ -140,51 +301,62 @@ inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt el // 3D //------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ +// Set E-vector value 
+//------------------------------------------------------------------------------ +template +inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { + const CeedInt target_comp = n / (P_1D * P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; + const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + r_v[target_node_z + target_comp * P_1D] = value; + } +} + //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -// TODO: remove "Dofs" and "Quads" in the following function names? -// - readDofsOffset3d -> readOffset3d ? -// - readDofsStrided3d -> readStrided3d ? -// - readSliceQuadsOffset3d -> readSliceOffset3d ? -// - readSliceQuadsStrided3d -> readSliceStrided3d ? -// - writeDofsOffset3d -> writeOffset3d ? -// - writeDofsStrided3d -> writeStrided3d ? 
-template -inline __device__ void readDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; - - for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp]; +template +inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; } + } } //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; +template +inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; const CeedInt ind = node * 
STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP]; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; } + } } //------------------------------------------------------------------------------ // E-vector -> Q-vector, offests provided //------------------------------------------------------------------------------ -template -inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { - const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d; - const CeedInt ind = indices[node + elem * Q_1d * Q_1d * Q_1d]; +template +inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, + CeedScalar *__restrict__ r_u) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; + const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } @@ -193,11 +365,11 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedIn //------------------------------------------------------------------------------ // E-vector -> Q-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { - const CeedInt node = data.t_id_x + 
data.t_id_y * Q_1d + q * Q_1d * Q_1d; +template +inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, + CeedScalar *__restrict__ r_u) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; @@ -207,55 +379,122 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedI //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; - const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; +template +inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); + } + } +} + +template +inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, + 
const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, + CeedScalar *__restrict__ d_v) { + const CeedInt target_comp = n / (P_1D * P_1D * P_1D); + const CeedInt target_node_x = n % P_1D; + const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; + const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); + + if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + + atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, full assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, + const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt elem_size = P_1D * P_1D * P_1D; + const CeedInt in_comp = in / elem_size; + const CeedInt in_node_x = in % P_1D; + const CeedInt in_node_y = (in % (P_1D * P_1D)) / P_1D; + const CeedInt in_node_z = (in % elem_size) / (P_1D * P_1D); + const CeedInt e_vec_size = elem_size * NUM_COMP; + + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D; + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; + + d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D]; + } + } + } +} + +//------------------------------------------------------------------------------ +// 
E-vector -> L-vector, Qfunction assembly +//------------------------------------------------------------------------------ +template +inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, + const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { + for (CeedInt z = 0; z < Q_1D; z++) { + const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D; - for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]); + for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) { + d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D]; + } } + } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, +template +inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { - if (data.t_id_x < P_1d && data.t_id_y < P_1d) - for (CeedInt z = 0; z < P_1d; z++) { - const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d]; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; } + } } 
//------------------------------------------------------------------------------ // 3D collocated derivatives computation //------------------------------------------------------------------------------ -template -inline __device__ void gradCollo3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { +template +inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d]; + __syncthreads(); + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; __syncthreads(); // X derivative r_V[comp + 0 * NUM_COMP] = 0.0; - for (CeedInt i = 0; i < Q_1d; i++) - r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; + } // Y derivative r_V[comp + 1 * NUM_COMP] = 0.0; - for (CeedInt i = 0; i < Q_1d; i++) - r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; + } // Z derivative r_V[comp + 2 * NUM_COMP] = 0.0; - for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative) - __syncthreads(); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; + } } } } @@ -263,26 +502,29 @@ inline __device__ void 
gradCollo3d(SharedData_Hip &data, const CeedInt q, const //------------------------------------------------------------------------------ // 3D collocated derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void gradColloTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { - if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { +template +inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { // X derivative - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; __syncthreads(); - for (CeedInt i = 0; i < Q_1d; i++) - r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; __syncthreads(); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; + } // Y derivative - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; __syncthreads(); - for (CeedInt i = 0; i < Q_1d; i++) - r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; __syncthreads(); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; + } // Z derivative - for (CeedInt i = 0; i < Q_1d; i++) - r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative) + for (CeedInt i = 0; i < Q_1D; 
i++) { + r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; + } } } } diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h index 2ac1968b2d..032d716828 100644 --- a/include/ceed/jit-source/hip/hip-jit.h +++ b/include/ceed/jit-source/hip/hip-jit.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h index 00b559ff10..c6b951b87a 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP non-tensor product basis templates - -#include +#include //------------------------------------------------------------------------------ // Tensor contraction @@ -24,7 +23,7 @@ inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U for (CeedInt comp = 0; comp < NUM_COMP; comp++) { // Run with Q threads - U = d_U + elem * strides_elem_U + comp * strides_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U]; for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0; for (CeedInt i = 0; i < P; i++) { const CeedScalar val = U[i]; @@ -53,9 +52,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid // Run with P threads r_V = 0.0; for (CeedInt d = 0; d < Q_COMP; d++) { - U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U]; for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; } - d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; + d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; } } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h index 953f6f48e3..71074a35dc 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP non-tensor product basis - -#include +#include #include "hip-ref-basis-nontensor-templates.h" @@ -21,18 +20,32 @@ //------------------------------------------------------------------------------ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) + Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, + BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); } +#endif } extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) + ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, + BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); } +#endif } //------------------------------------------------------------------------------ @@ -40,18 +53,32 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca //------------------------------------------------------------------------------ extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const 
CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) + Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, + BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); } +#endif } extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) + ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, + BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); } +#endif } //------------------------------------------------------------------------------ @@ -61,7 +88,13 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re const CeedInt t_id = threadIdx.x; // TODO load q_weight in shared memory if blockDim.z > 1? 
+#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) + d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; } +#endif } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h new file mode 100644 index 0000000000..61ef0d3f0a --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h @@ -0,0 +1,408 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA tensor product basis with AtPoints evaluation +#include + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + 
+//------------------------------------------------------------------------------ +// Tensor Basis Kernels AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp +//------------------------------------------------------------------------------ +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update 
buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; + } + post *= Q; + } + + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; + out[a * post + c] = v_k; + } + } + post *= 1; + } + } + } + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + 
BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } + + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + pre = 1; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + // Update buffers used + pre /= 1; + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? 
buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (d == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; + } + } + } + post *= Q; + } + } + + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; + } + post *= P; + } + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = 
s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; + const CeedInt u_dim_stride = 0; + const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? 
s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; + } + post *= Q; + } + + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; + + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { + // Update buffers used + pre /= Q; + const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; + out[a * post + c] = v_k; + } + } + post *= 1; + } + } + } + } + } +} + +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar 
*s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + const CeedInt v_dim_stride = 0; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } + + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + + pre = 1; + post = 1; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { + // Update buffers used + pre /= 1; + const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? 
buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (dim_2 == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; + } + } + } + post *= Q; + } + } + } + + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; + } + post *= P; + } + } + } +} diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h index efbb06548b..1455b5ac21 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP tensor product basis - -#include +#include //------------------------------------------------------------------------------ // Tensor Basis Kernels @@ -17,7 +16,7 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; @@ -29,44 +28,42 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos s_interp_1d[k] = interp_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; - const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_size = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt P = is_transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = is_transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride_0 = is_transpose ? 1 : BASIS_P_1D; + const CeedInt stride_1 = is_transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = is_transpose ? 
BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); + const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); + const CeedInt u_size = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; - for (CeedInt k = i; k < u_size; k += blockDim.x) { - s_buffer_1[k] = cur_u[k]; - } for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); // Update buffers used pre /= P; - const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? 
s_buffer_1 : s_buffer_2); const CeedInt writeLen = pre * post * Q; // Contract along middle index for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar vk = 0; - - for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; } @@ -77,7 +74,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d, const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; @@ -91,26 +88,26 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, s_grad_1d[k] = grad_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; - const CeedInt u_comp_stride = num_elem * (transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_dim_stride = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; - const CeedInt v_dim_stride = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; + const CeedInt P = is_transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = is_transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride_0 = is_transpose ? 1 : BASIS_P_1D; + const CeedInt stride_1 = is_transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); + const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); + const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; + const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { // dim*dim contractions for grad for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedInt pre = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + CeedInt pre = is_transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES; CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); @@ -129,7 +126,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, CeedScalar v_k = 0; for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; + if (is_transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; else out[k] = v_k; } post *= Q; diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h index e6a8b6e6a1..581545f71a 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP operator diagonal assembly - -#include +#include #if USE_CEEDSIZE typedef CeedSize IndexType; diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h index bf86921066..a235c8be7a 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP operator full assembly - -#include +#include #if USE_CEEDSIZE typedef CeedSize IndexType; @@ -24,7 +23,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { extern __shared__ CeedScalar s_CT[]; - CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN]; const int l = threadIdx.x; // The output column index of each B^T D B operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il @@ -62,7 +61,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l]; } } // end of out eval mode - } // end of in eval mode + } // end of in eval mode if (orients_in) { result *= orients_in[NUM_NODES_IN * e + l] ? 
-1.0 : 1.0; } @@ -101,6 +100,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ } } } // end of out component - } // end of in component - } // end of element loop + } // end of in component + } // end of element loop } diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h index 1b423072af..bf605feba4 100644 --- a/include/ceed/jit-source/hip/hip-ref-qfunction.h +++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP backend QFunction read/write kernels - -#include +#include //------------------------------------------------------------------------------ // Read from quadrature points diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h new file mode 100644 index 0000000000..3c88d685a3 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h @@ -0,0 +1,56 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP offset element restriction kernels +#include + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, standard (with offsets) +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + if (loc_node >= points_per_elem[elem]) continue; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } +} +#else +extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ points_per_elem, const CeedInt *__restrict__ t_offsets, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + if (loc_node >= points_per_elem[elem]) 
continue; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h index 4d3e88ce27..ee5544309d 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP curl-oriented element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, curl-oriented @@ -80,7 +79,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } @@ -138,7 +137,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h index 26cd41ee92..a3e952b5ca 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP offset element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, standard (with offsets) @@ -36,7 +35,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h index cb987fa8a7..ffe8890ef2 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP oriented element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, oriented @@ -40,7 +39,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h index de1335c117..445aede42d 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP strided element restriction kernels - -#include +#include //------------------------------------------------------------------------------ // L-vector -> E-vector, strided diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h new file mode 100644 index 0000000000..71d183bcf8 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h @@ -0,0 +1,98 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP shared memory non-tensor basis templates +#include + +//------------------------------------------------------------------------------ +// 1D tensor contraction +//------------------------------------------------------------------------------ +template +inline __device__ void Contract1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + data.slice[data.t_id_x] = *U; + __syncthreads(); + *V = 0.0; + if (data.t_id_x < Q_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction + } + } + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 1D transpose tensor contraction +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + data.slice[data.t_id_x] = *U; + __syncthreads(); + if (data.t_id_x < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[data.t_id_x + i * P_1D] * 
data.slice[i]; // Contract x direction + } + } + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// Interpolate to quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + Contract1d(data, &r_U[comp], c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// Interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = 0.0; + ContractTranspose1d(data, &r_U[comp], c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// Derivatives at quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt dim = 0; dim < DIM; dim++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + Contract1d(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]); + } + } +} + +//------------------------------------------------------------------------------ +// Derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < 
NUM_COMP; comp++) r_V[comp] = 0.0; + for (CeedInt dim = 0; dim < DIM; dim++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTranspose1d(data, &r_U[comp + dim * NUM_COMP], &c_G[dim * P * Q], &r_V[comp]); + } + } +} + +//------------------------------------------------------------------------------ +// Quadrature weights +//------------------------------------------------------------------------------ +template +inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) { + *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0; +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h new file mode 100644 index 0000000000..175e720a55 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h @@ -0,0 +1,203 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP shared memory non-tensor basis +#include + +#include "hip-shared-basis-nontensor-templates.h" +#include "hip-shared-basis-read-write-templates.h" + +//------------------------------------------------------------------------------ +// Interp kernels +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load interp into shared memory + __shared__ CeedScalar s_B[BASIS_P * BASIS_Q]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U); + InterpNonTensor(data, r_U, s_B, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V); + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice 
+ data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load interp into shared memory + __shared__ CeedScalar s_B[BASIS_P * BASIS_Q]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + InterpTransposeNonTensor(data, r_U, s_B, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load interp into shared memory + __shared__ CeedScalar s_B[BASIS_P * BASIS_Q]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + InterpTransposeNonTensor(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +//------------------------------------------------------------------------------ +// Grad kernels +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar 
*__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; + + // load grad into shared memory + __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U); + GradNonTensor(data, r_U, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V); + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load grad into shared memory + __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + GradTransposeNonTensor(data, r_U, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, 
d_V); + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load grad into shared memory + __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U); + GradTransposeNonTensor(data, r_U, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V); + } +} + +//------------------------------------------------------------------------------ +// Weight kernel +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D; + + CeedScalar r_W[1]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + WeightNonTensor(data, q_weight, r_W); + WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * 
num_elem, BASIS_Q, r_W, d_W); + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h index a6d945ac56..80be446bee 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,17 +7,14 @@ /// @file /// Internal header for HIP shared memory basis read/write templates - -#include +#include //------------------------------------------------------------------------------ // Helper function: load matrices for basis actions //------------------------------------------------------------------------------ -template -inline __device__ void loadMatrix(const CeedScalar *d_B, CeedScalar *B) { - CeedInt tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - - for (CeedInt i = tid; i < SIZE; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; +template +inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { + for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } //------------------------------------------------------------------------------ @@ -56,6 +53,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node 
= data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -92,6 +102,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -131,3 +154,57 @@ inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt } } } + +template +inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} + +//------------------------------------------------------------------------------ +// AtPoints 
+//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// E-vector -> single point +//------------------------------------------------------------------------------ +template +inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, const CeedInt strides_point, + const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; + + if (p < points_in_elem) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = d_u[ind + comp * strides_comp]; + } + } else { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = 0.0; + } + } +} + +//------------------------------------------------------------------------------ +// Single point -> E-vector +//------------------------------------------------------------------------------ +template +inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, + CeedScalar *d_v) { + if (p < points_in_elem) { + const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] = r_v[comp]; + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h new file mode 100644 index 0000000000..d93ce6c90b --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h @@ -0,0 +1,467 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP shared memory tensor product basis AtPoints templates +#include + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 1D interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + __syncthreads(); + 
if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + 
} + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives transpose: for each comp, adds r_U[comp] times the Chebyshev derivatives evaluated at r_X[0] into r_C[comp], staged through data.slice; only threads with p < NUM_POINTS contribute +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory (each thread below Q_1D zeroes its own entry of slice[0, Q_1D)) + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction; every comp accumulates into slice[0, Q_1D), the same window that is cleared above and read back below (index rotated by t_id_x) + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[(i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register (after all threads have finished accumulating) + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 2D interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) {
+ buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar r_u = p < NUM_POINTS ? 
r_U[comp] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: indices are rotated by thread id so that, at a given (i, j) step, threads with distinct (t_id_x, t_id_y) below Q_1D target distinct slice entries; atomicAdd is still used for the accumulation + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + // Pull from shared to register (after all threads have finished accumulating) + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] +=
chebyshev_x[i] * buffer[i]; + } + } + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: indices are rotated by thread id so that, at a given (i, j) step, threads with distinct (t_id_x, t_id_y) below Q_1D target distinct slice entries; atomicAdd is still used for the accumulation + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register (both dim contributions have been accumulated into the slice by now) + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 3D interpolate to points
+//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction value + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i] * z; + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction value + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * 
Q_1D] = 0.0; + __syncthreads(); + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u * z; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: indices are rotated by thread id so that, at a given (i, j) step, threads with distinct (t_id_x, t_id_y) below Q_1D target distinct slice entries; atomicAdd is still used for the accumulation + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + // Pull from shared to register (after all threads have finished accumulating) + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction values + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + const CeedScalar dz = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + // Gradient directions + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract x direction + if (dim == 0)
ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar zz = dim == 2 ? dz : z; + + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz; + } + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Get z contraction values + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + const CeedScalar dz = chebyshev_x[k]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Gradient directions + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract y and z direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + const CeedScalar zz = dim == 2 ? dz : z; + const CeedScalar r_u = p < NUM_POINTS ? 
r_U[comp + dim * NUM_COMP] : 0.0; + + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_u * zz; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: indices are rotated by thread id so that, at a given (i, j) step, threads with distinct (t_id_x, t_id_y) below Q_1D target distinct slice entries; atomicAdd is still used for the accumulation + const CeedInt ii = (i + data.t_id_y) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + data.t_id_x) % Q_1D; + + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register (all three dim contributions have been accumulated into the slice by now) + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h new file mode 100644 index 0000000000..f30e6070c4 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h @@ -0,0 +1,396 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP tensor product basis with AtPoints evaluation +#include + +#include "hip-shared-basis-read-write-templates.h" +#include "hip-shared-basis-tensor-at-points-templates.h" +#include "hip-shared-basis-tensor-templates.h" + +//------------------------------------------------------------------------------ +// Tensor Basis Kernels AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // load chebyshev_interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, s_B, r_C); + } + + // Map to points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + InterpAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + InterpAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + InterpAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x 
= threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load chebyshev_interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Clear output vector + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; + if (BASIS_DIM == 1) { + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + 
} else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load chebyshev_interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = 
threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; + + // load chebyshev_interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, s_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, s_B, r_C); + } + + // Map to points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + GradAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + GradAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + GradAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } + } +} + +extern "C" 
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load chebyshev_interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Clear output vector + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1); i++) r_V[i] = 0.0; + if (BASIS_DIM == 1) { + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, + r_U); + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar 
*__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_X[BASIS_DIM]; + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // load chebyshev_interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + + // Map from points + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, + r_U); + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h new file mode 100644 index 0000000000..bba3c2f8a1 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h @@ -0,0 +1,677 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP shared memory tensor product basis templates +#include + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 2D tensor contraction x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, + CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, + CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, + const 
CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contract x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < P_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contract and add x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D] = *U; + __syncthreads(); + if (t_id_x < P_1D && t_id_y < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 2D pack/unpack quadrature values +//------------------------------------------------------------------------------ +template +inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { + const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x 
/ Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; + __syncthreads(); + U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; + } +} + +template +inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { + const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; + __syncthreads(); + U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + CeedScalar r_t[1]; + + if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); + ContractY2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U); + if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int 
t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + CeedScalar r_t[1]; + + if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); + ContractTransposeX2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + + if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U); + if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; + + if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } + __syncthreads(); + if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V); +} + +//------------------------------------------------------------------------------ 
+// 2D derivatives at quadrature points: slot 0 of r_V uses c_G in x and c_B in y, slot 1 uses c_B in x and c_G in y
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;  // split the flat index into (x, y) with row width T_1D
+  CeedScalar r_t[1];  // one-value scratch handed from the x contraction to the y contraction
+
+  if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U);  // runs only when the P_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);  // slot 0
+    ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);  // slot 1
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U);  // the input r_U is packed again, not only the output
+  if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose: r_V[comp] receives the sum of the contributions from both derivative slots of r_U
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;  // split the flat index into (x, y) with row width T_1D
+  CeedScalar r_t[1];  // one-value scratch handed from the y contraction to the x contraction
+
+  if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U);  // runs only when the Q_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2dFlattened(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);  // overwrites r_V[comp]
+    ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2dFlattened(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);  // adds onto r_V[comp]
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated (only c_G is applied; c_B is not referenced)
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;  // split the flat index into (x, y) with row width T_1D
+
+  if (P_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U);  // runs only when the P_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);  // slot 0: c_G in x
+    ContractY2dFlattened(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);  // slot 1: c_G in y
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_U);  // the input r_U is packed again, not only the output
+  if (Q_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated (only c_G is applied; c_B is not referenced)
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;  // split the flat index into (x, y) with row width T_1D
+
+  if (Q_1D != T_1D) QUnpack2d(data, t_id_x, t_id_y, r_U);  // runs only when the Q_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);     // overwrites r_V[comp]
+    ContractTransposeAddX2dFlattened(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);  // adds onto r_V[comp]
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D quadrature weights
+//------------------------------------------------------------------------------
+template
+inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;  // split the flat index into (x, y) with row width Q_1D
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;  // product of the two 1D weights, else 0.0
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D tensor contract x: *V = sum over i < P_1D of B[i + t_id_x * P_1D] * (value staged by the thread at x = i, same y and z)
+//------------------------------------------------------------------------------
+template
+inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();  // barrier before data.slice is overwritten
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;  // every thread stages its own value
+  __syncthreads();  // barrier before other threads' staged values are read
+  *V = 0.0;         // threads outside the guard below leave with 0.0
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract y: *V = sum over i < P_1D of B[i + t_id_y * P_1D] * (value staged by the thread at y = i, same x and z)
+//------------------------------------------------------------------------------
+template
+inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();  // barrier before data.slice is overwritten
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;  // every thread stages its own value
+  __syncthreads();  // barrier before other threads' staged values are read
+  *V = 0.0;         // threads outside the guard below leave with 0.0
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------ +// 3D tensor contract z +//------------------------------------------------------------------------------ +template +inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { + for (CeedInt i = 0; i < P_1D; i++) { + *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract z +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract z +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, + const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < 
P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract y +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, + const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, + const 
CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + *V = 0.0; + if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract add x +//------------------------------------------------------------------------------ +template +inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, + const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); + data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; + __syncthreads(); + if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { + for (CeedInt i = 0; i < Q_1D; i++) { + *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction + } + } +} + +//------------------------------------------------------------------------------ +// 3D pack/unpack quadrature values +//------------------------------------------------------------------------------ +template +inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { + const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; + __syncthreads(); + U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? 
data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; + } +} + +template +inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { + const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + __syncthreads(); + if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; + __syncthreads(); + U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate to quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + CeedScalar r_t1[1], r_t2[1]; + + if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); + ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); + ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U); + if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, 
CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + CeedScalar r_t1[1], r_t2[1]; + + if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); + ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); + ContractTransposeX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + + if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U); + if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = 
data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + + if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D derivatives at quadrature points +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + CeedScalar r_t1[1], r_t2[1]; + + if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); + ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); + ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); + ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); + ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); + ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); + ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); + ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U); + if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose 
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);  // flat index -> (x, y, z)
+  CeedScalar r_t1[1], r_t2[1];  // one-value scratch slots passed between successive contractions
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);  // runs only when the Q_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);  // slot 0: c_G applied in x
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);  // overwrites r_V[comp]
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);  // slot 1: c_G applied in y
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);  // adds onto r_V[comp]
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);  // slot 2: c_G applied in z
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);  // adds onto r_V[comp]
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points: contract with c_B in x, y, z first, then apply c_G to that result once per direction
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);  // flat index -> (x, y, z)
+  CeedScalar r_t1[1], r_t2[1];  // one-value scratch slots passed between successive contractions
+
+  if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);  // runs only when the P_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);  // r_t1 now feeds all three c_G contractions below
+    ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U);  // the input r_U is packed again, not only the output
+  if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose: sum the c_G transposes of the three derivative slots, then contract with c_B in z, y, x
+//------------------------------------------------------------------------------
+template
+inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);  // flat index -> (x, y, z)
+  CeedScalar r_t1[1], r_t2[1];  // one-value scratch slots passed between successive contractions
+
+  if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U);  // runs only when the Q_1D and T_1D extents differ
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);     // overwrites r_t2
+    ContractTransposeAddY3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);  // adds onto r_t2
+    ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);  // adds onto r_t2
+    ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractTransposeY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at
quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + + if (P_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX3dFlattened(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); + ContractY3dFlattened(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); + ContractZ3dFlattened(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]); + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_U); + if (Q_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); + + if (Q_1D != T_1D) QUnpack3d(data, t_id_x, t_id_y, t_id_z, r_U); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeZ3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]); + ContractTransposeAddY3dFlattened(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); + ContractTransposeAddX3dFlattened(data, t_id_x, t_id_y, t_id_z, 
&r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); + } + __syncthreads(); + if (P_1D != T_1D) QPack3d(data, t_id_x, t_id_y, t_id_z, r_V); +} + +//------------------------------------------------------------------------------ +// 3D quadrature weights +//------------------------------------------------------------------------------ +template +inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { + const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); + + *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h index 5e52d1c829..ada945ed1e 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP shared memory tensor product basis templates - -#include +#include //------------------------------------------------------------------------------ // 1D @@ -19,6 +18,7 @@ //------------------------------------------------------------------------------ template inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); *V = 0.0; @@ -27,7 +27,6 @@ inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, co *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ @@ -35,6 +34,7 @@ inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, co //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); *V = 0.0; @@ -43,56 +43,77 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 1D interpolate to quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_B, r_V + comp); + ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } } 
//------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; } } //------------------------------------------------------------------------------ // 1D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, 
r_U + comp, c_G, r_V + comp); + ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } } //------------------------------------------------------------------------------ // 1D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } } //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -template +template inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; } @@ -104,8 +125,9 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -114,14 +136,14 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void 
ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -130,14 +152,14 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -146,14 +168,14 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; @@ -162,14 +184,14 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template +template inline __device__ void 
ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { @@ -177,68 +199,113 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedS *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_B, r_V + comp); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp]); } } //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp, c_B, r_t); - ContractTransposeX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp], c_B, r_t); + ContractTransposeX2d(data, r_t, c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline 
__device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[comp] = r_U[comp]; } } //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); + ContractX2d(data, &r_U[comp], c_G, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); } } //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar 
*__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); - ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); - ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2d(data, r_t, c_G, &r_V[comp]); + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2d(data, r_t, c_B, &r_V[comp]); + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX2d(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); + ContractY2d(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); + ContractTransposeAddX2d(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); } } //------------------------------------------------------------------------------ // 2D quadrature weights 
//------------------------------------------------------------------------------ -template +template inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } @@ -250,7 +317,7 @@ inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__ //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { @@ -258,6 +325,7 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -266,14 +334,13 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { @@ -281,6 +348,7 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -289,14 +357,13 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co V[k] += 
r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { for (CeedInt k = 0; k < Q_1D; k++) { V[k] = 0.0; @@ -311,7 +378,7 @@ inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, co //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { for (CeedInt k = 0; k < P_1D; k++) { V[k] = 0.0; @@ -326,7 +393,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScal //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -334,6 +401,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -342,14 +410,13 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D 
transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -357,6 +424,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { @@ -364,14 +432,13 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -379,6 +446,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; @@ -387,14 +455,13 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 
CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { @@ -402,6 +469,7 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS } for (CeedInt k = 0; k < P_1D; k++) { + __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { @@ -409,121 +477,172 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } - __syncthreads(); } } //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D]); } } //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template +template inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, &r_U[comp * Q_1D], c_B, r_t1); + 
ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate to quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < Q_1D; i++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[i + comp * Q_1D] = r_U[i + comp * P_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < Q_1D; i++) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_V[i + comp * P_1D] = r_U[i + comp * Q_1D]; + } } } //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_G, r_t2); - 
ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, &r_U[comp * P_1D], c_G, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_G, r_t2); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeX3d(data, r_t2, c_G, &r_V[comp * P_1D]); + 
ContractTransposeZ3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_G, r_t2); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template +template inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); - ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); + ContractY3d(data, r_t1, c_B, r_t2); + ContractZ3d(data, r_t2, c_B, r_t1); + ContractX3d(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar 
r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeZ3d(data, r_t2, c_B, r_t1); - ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeZ3d(data, r_t2, c_B, r_t1); + ContractTransposeY3d(data, r_t1, c_B, r_t2); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at quadrature points, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractX3d(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose, nodes and quadrature points collocated +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, 
const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); } } //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template +template inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h index 0a9a1f3cee..9e1d3b5263 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP shared memory tensor product basis - -#include +#include #include "hip-shared-basis-read-write-templates.h" #include "hip-shared-basis-tensor-templates.h" @@ -17,168 +16,702 @@ // Interp kernel by dim //------------------------------------------------------------------------------ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ - void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; - // load interp_1d into shared memory - __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); - __syncthreads(); - SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, s_B, r_V); + if (elem < num_elem) WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, s_B, r_V); + if (elem < num_elem) WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, s_B, r_V); + if (elem < num_elem) WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); + } +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); - Interp1d(data, r_U, s_B, r_V); + Interp1d(data, r_U, s_B, r_V); WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); - InterpTensor2d(data, r_U, s_B, r_V); + InterpTensor2d(data, r_U, s_B, r_V); WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 3) 
{ ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); - InterpTensor3d(data, r_U, s_B, r_V); + InterpTensor3d(data, r_U, s_B, r_V); WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } +#endif } extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ - void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; - // load interp_1d into shared memory - __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); - __syncthreads(); + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_P_1D : 1)]; + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V); + } + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V); + } + } +#endif +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, 
CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + if (elem < num_elem) WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + if (elem < num_elem) WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + if (elem < num_elem) WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { 
ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); - InterpTranspose1d(data, r_U, s_B, r_V); + InterpTranspose1d(data, r_U, s_B, r_V); WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - InterpTransposeTensor2d(data, r_U, s_B, r_V); + InterpTransposeTensor2d(data, r_U, s_B, r_V); WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - InterpTransposeTensor3d(data, r_U, s_B, r_V); + InterpTransposeTensor3d(data, r_U, s_B, r_V); WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } +#endif +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } + } +#endif +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, 
CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + if (elem < num_elem) SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + if (elem < num_elem) SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + if (elem < num_elem) SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, 
BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +#endif +} + +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (elem < num_elem) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V); + } + } +#endif } //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ -extern 
"C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ - void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + // load interp_1d and grad_1d into shared memory __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); + LoadMatrix(data, c_B, s_B); __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; - loadMatrix(d_grad_1d, s_G); + LoadMatrix(data, c_G, s_G); __syncthreads(); + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Grad1d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + GradTensor2d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, + d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTensor3d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Grad1d(data, r_U, s_B, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + GradTensor2d(data, r_U, s_B, s_G, r_V); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, + d_V); + } else if (BASIS_DIM == 3) { + 
ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTensor3d(data, r_U, s_B, s_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); + } + } +#endif +} + +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Grad1d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + GradTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, + d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + GradTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); + } +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); - Grad1d(data, r_U, s_B, s_G, r_V); + Grad1d(data, r_U, NULL, s_G, r_V); WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); - GradTensor2d(data, r_U, s_B, s_G, r_V); + GradTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 3) { 
ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); - if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, s_B, s_G, r_V); - else GradTensor3d(data, r_U, s_B, s_G, r_V); + GradTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } +#endif } extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ - void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + // load interp_1d and grad_1d into shared memory __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); + LoadMatrix(data, c_B, s_B); __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; - loadMatrix(d_grad_1d, s_G); + LoadMatrix(data, c_G, s_G); __syncthreads(); + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * 
BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +#endif +} + +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); - GradTranspose1d(data, r_U, s_B, s_G, r_V); + GradTranspose1d(data, r_U, NULL, s_G, r_V); WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D 
* BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); - else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } +#endif +} + +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + LoadMatrix(data, c_B, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + if (elem < num_elem) SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if 
(BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +#endif +} + +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; + LoadMatrix(data, c_G, s_G); + __syncthreads(); + + // Apply basis element by element +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + if (elem < num_elem) ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + if (elem < num_elem) ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + if (elem < num_elem) ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + if (elem < num_elem) SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } +#else + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, NULL, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, 
elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +#endif } //------------------------------------------------------------------------------ @@ -193,21 +726,36 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__ data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1]; +#ifdef __HIP_PLATFORM_SPIRV__ + CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; + if (BASIS_DIM == 1) { + Weight1d(data, q_weight_1d, r_W); + if (elem < num_elem) WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W); + } else if (BASIS_DIM == 2) { + WeightTensor2d(data, q_weight_1d, r_W); + if (elem < num_elem) WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); + } else if (BASIS_DIM == 3) { + WeightTensor3d(data, q_weight_1d, r_W); + if (elem < num_elem) WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W, + d_W); + } +#else for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - Weight1d(data, q_weight_1d, r_W); + Weight1d(data, q_weight_1d, r_W); WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 2) { - WeightTensor2d(data, q_weight_1d, r_W); + WeightTensor2d(data, 
q_weight_1d, r_W); WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 3) { - WeightTensor3d(data, q_weight_1d, r_W); + WeightTensor3d(data, q_weight_1d, r_W); WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); } } +#endif } diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h index 0042199c8b..ebe689c094 100644 --- a/include/ceed/jit-source/hip/hip-types.h +++ b/include/ceed/jit-source/hip/hip-types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for HIP type definitions -#ifndef CEED_HIP_TYPES_H -#define CEED_HIP_TYPES_H +#pragma once #include @@ -24,6 +23,13 @@ typedef struct { CeedInt *outputs[CEED_HIP_NUMBER_FIELDS]; } FieldsInt_Hip; +typedef struct { + CeedInt num_elem; + const CeedInt *num_per_elem; + const CeedInt *indices; + const CeedScalar *coords; +} Points_Hip; + typedef struct { CeedInt t_id_x; CeedInt t_id_y; @@ -31,5 +37,3 @@ typedef struct { CeedInt t_id; CeedScalar *slice; } SharedData_Hip; - -#endif // CEED_HIP_TYPES_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h index dd21682225..ed2aceb69a 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis gradient in 1D - #include "magma-common-tensor.h" // macros to abstract access of shared memory and reg. file @@ -126,3 +125,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_1d(sV, dV, cstrdV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradta_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_grad_1d_device(sT, sU, sV, tx); + __syncthreads(); + + // sum into V + sum_1d(sV, dV, cstrdV, tx); +} diff --git 
a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h index 23559716dc..9fda73c657 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis gradient in 2D - #include "magma-common-tensor.h" // macros to abstract access of shared memory and reg. file @@ -188,3 +187,54 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradta_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = 
(CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + // sum into V + sum_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h index c030f8e9e5..4b835216f2 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis gradient in 3D - #include "magma-common-tensor.h" // macros to abstract access of shared memory and reg. 
file @@ -225,3 +224,61 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // write V write_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradta_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P))); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 1 for dU, 
i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + // sum into V + sum_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h index ae8d082653..531b9273e2 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis interpolation in 1D - #include "magma-common-tensor.h" // macros to abstract access of shared memory and reg. 
file @@ -126,3 +125,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_1d(sV, dV, cstrdV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpta_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_interp_1d_device(sT, sU, sV, tx); + __syncthreads(); + + // sum into V + sum_1d(sV, dV, cstrdV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h index a2a41a25ae..04640fe75b 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis interpolation in 1D - #include "magma-common-tensor.h" // macros to abstract access of shared memory and reg. file @@ -144,3 +143,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_V_2d(dV, cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_interpta_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U -- there is a sync at the end of this function + read_U_2d(dU, cstrdU, rU, sTmp, tx); + + // no sync needed here -- read_U_2d already syncs at the end + magma_interp_2d_device(sT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // sum into V + sum_V_2d(dV, cstrdV, rV, tx); +} diff --git 
a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h index 50c7e4df4a..004071ee32 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis interpolation in 3D - #include "magma-common-tensor.h" // macros to abstract access of shared memory and reg. file @@ -172,3 +171,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // write V write_V_3d(dV, cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpta_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_P] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp 
+= ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P)); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + read_U_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // sum into V + sum_V_3d(dV, cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h index f5e2df1e90..15f2b90ce6 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA non-tensor basis interpolation - #include "magma-common-nontensor.h" //////////////////////////////////////////////////////////////////////////////// @@ -99,6 +98,52 @@ static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, Ce } } +//////////////////////////////////////////////////////////////////////////////// +template +static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC, + CeedScalar *shared_data) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = (n + NB - 1) / NB; + const int myn = min(NB, n - id * NB); + + dB += id * Q * NB; + dC += id * P * NB; + + // A is P x Q + CeedScalar *sA = shared_data; + CeedScalar *sB = shared_data + ty * Q * NB; + + CeedScalar rC[NB] = {0.0}; + + // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) + for (int d = 0; d < Q_COMP; d++) { + // read A using all threads + CeedScalar rA[Q]; + read_A_notrans_g2r_1D_nosync(tx, ty, dA, sA, rA); + __syncthreads(); + + // read B + if (id < nblocks) { + read_B_g2s_1D_nosync(tx, myn, dB, sB); + } + __syncthreads(); + + addmul_rAsBrC_1D_nosync(rA, sB, rC); + + dA += P * Q; + dB += Q * n; + + __syncthreads(); + } + + // sum into C + if (id < nblocks) { + sum_C_r2g_1D_nosync(tx, myn, rC, dC); + } +} + //////////////////////////////////////////////////////////////////////////////// template static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, @@ -171,6 +216,42 @@ static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, C write_C_r2g_1D_nosync(tx, myn, rC, dC); } +//////////////////////////////////////////////////////////////////////////////// 
+template +static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, + CeedScalar *shared_data) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = (n + NB - 1) / NB; + const int myn = min(NB, n - id * NB); + + dB += id * Q * NB; + dC += id * P * NB; + + // A is P x Q + CeedScalar *sA = shared_data; + CeedScalar *sB = shared_data + ty * Q * NB; + + // read A using all threads + CeedScalar rA[Q]; + read_A_notrans_g2r_1D_nosync(tx, ty, dA, sA, rA); + __syncthreads(); + + // terminate threads with no work + if (id >= nblocks) return; + + // read B + read_B_g2s_1D_nosync(tx, myn, dB, sB); + __syncthreads(); + + CeedScalar rC[NB]; + mul_rAsBrC_1D_nosync(rA, sB, rC); + + // sum into C + sum_C_r2g_1D_nosync(tx, myn, rC, dC); +} + //////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { @@ -195,6 +276,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _ #endif } +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + +#if BASIS_Q_COMP_INTERP == 1 + magma_basis_nontensor_device_ta1(n, dA, dB, dC, (CeedScalar *)shared_data); +#else + magma_basis_nontensor_device_ta(n, dA, dB, dC, (CeedScalar *)shared_data); +#endif +} + //////////////////////////////////////////////////////////////////////////////// extern "C" 
__launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { @@ -218,3 +311,15 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _ magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data); #endif } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + +#if BASIS_Q_COMP_DERIV == 1 + magma_basis_nontensor_device_ta1(n, dA, dB, dC, (CeedScalar *)shared_data); +#else + magma_basis_nontensor_device_ta(n, dA, dB, dC, (CeedScalar *)shared_data); +#endif +} diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h index 431fbb6d03..d922a7586c 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis weight in 1D - #include "magma-common-tensor.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h index 034992e8f1..9cbb18baae 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis weight in 2D - #include "magma-common-tensor.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h index a5ee73bd96..8fc3e96919 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA tensor basis weight in 3D - #include "magma-common-tensor.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h index 6a20ecefd6..51cf97d727 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,6 @@ /// @file /// Internal header for MAGMA non-tensor basis weight - #include "magma-common-nontensor.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h index a4913c2082..22a1b835cb 100644 --- a/include/ceed/jit-source/magma/magma-common-defs.h +++ b/include/ceed/jit-source/magma/magma-common-defs.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for MAGMA backend common definitions -#ifndef CEED_MAGMA_COMMON_DEFS_H -#define CEED_MAGMA_COMMON_DEFS_H +#pragma once #define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[]; @@ -21,5 +20,3 @@ // Define macro for computing the total threads in a block for use with __launch_bounds__() #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) - -#endif // CEED_MAGMA_COMMON_DEFS_H diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h index 730acc6419..8f33484295 100644 --- a/include/ceed/jit-source/magma/magma-common-nontensor.h +++ b/include/ceed/jit-source/magma/magma-common-nontensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -104,6 +104,25 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int } } +//////////////////////////////////////////////////////////////////////////////// +// sum into C from reg. to global +// C is (P x NB) +// 1D thread config. 
with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void sum_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) { + if (n != NB) { + for (int i = 0; i < n; i++) { + dC[i * P + tx] += rC[i]; + } + } else { +#pragma unroll + for (int i = 0; i < NB; i++) { + dC[i * P + tx] += rC[i]; + } + } +} + //////////////////////////////////////////////////////////////////////////////// // multiply C = A x B using 1D threads in P x 1 config // A (P x Q) in reg., one row per thread diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h index 6c483abd9d..d0ca2f53c0 100644 --- a/include/ceed/jit-source/magma/magma-common-tensor.h +++ b/include/ceed/jit-source/magma/magma-common-tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -36,6 +36,18 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 1D element into global memory from sV[][] -- for all components +// the devptr is assumed to point directly to the element +template +static __device__ __inline__ void sum_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + devptr[comp * compstride + tx] += sBuffer[comp][tx]; + } + } +} + //////////////////////////////////////////////////////////////////////////////// // read U of a 2D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride @@ -107,6 +119,23 @@ static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[D } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. 
max(P, Q)) +template +static __device__ __inline__ void sum_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * Q + tx] += rV[i_DIM][comp][j]; + } + } + } +} + //////////////////////////////////////////////////////////////////////////////// // read U of a 3D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride @@ -178,6 +207,23 @@ static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[D } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to point directly to the element (i.e. already offset by elem-stride) +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) +template +static __device__ __inline__ void sum_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < (Q * Q)) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * (Q * Q) + tx] += rV[i_DIM][comp][j]; + } + } + } +} + //////////////////////////////////////////////////////////////////////////////// // reads T (no-trans) into shared memory // T is B x J diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h index aa54232c2d..5dada5b9eb 100644 --- a/include/ceed/jit-source/sycl/sycl-gen-templates.h +++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,7 +7,7 @@ /// @file /// Internal header for SYCL backend macro and type definitions for JiT source -#include +#include #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h index f4824d8a34..25837f5701 100644 --- a/include/ceed/jit-source/sycl/sycl-jit.h +++ b/include/ceed/jit-source/sycl/sycl-jit.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h index d62de2533a..9f5df69e68 100644 --- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h +++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for SYCL backend QFunction read/write kernels - -#include +#include //------------------------------------------------------------------------------ // Read from quadrature points diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h index 421875b509..551789e48b 100644 --- a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,9 +7,7 @@ /// @file /// Internal header for SYCL shared memory basis read/write templates - -#include -#include "sycl-types.h" +#include //------------------------------------------------------------------------------ // Helper function: load matrices for basis actions diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h index 28bd24d9f9..f023b77d6b 100644 --- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for SYCL shared memory tensor product basis templates - -#include +#include //------------------------------------------------------------------------------ // 1D diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h index f8e4ccdc0a..71f60cce8b 100644 --- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h +++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -7,8 +7,7 @@ /// @file /// Internal header for SYCL shared memory tensor product basis - -#include +#include #include "sycl-shared-basis-read-write-templates.h" #include "sycl-shared-basis-tensor-templates.h" diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h index 58938a4b2a..5133c6eee8 100644 --- a/include/ceed/jit-source/sycl/sycl-types.h +++ b/include/ceed/jit-source/sycl/sycl-types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/jit-tools.h b/include/ceed/jit-tools.h index 60e0795f50..c82a9ad075 100644 --- a/include/ceed/jit-tools.h +++ b/include/ceed/jit-tools.h @@ -1,4 +1,4 @@ -/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. /// /// SPDX-License-Identifier: BSD-2-Clause diff --git a/include/ceed/types.h b/include/ceed/types.h index 6817a73322..c687c218f9 100644 --- a/include/ceed/types.h +++ b/include/ceed/types.h @@ -1,4 +1,4 @@ -/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. /// /// SPDX-License-Identifier: BSD-2-Clause @@ -10,8 +10,10 @@ #ifndef CEED_QFUNCTION_DEFS_H #define CEED_QFUNCTION_DEFS_H +#ifndef CEED_RUNNING_JIT_PASS #include #include +#endif /** @ingroup CeedQFunction @@ -23,7 +25,7 @@ #ifndef __NO_INLINE__ #if defined(__GNUC__) || defined(__clang__) #define CEED_QFUNCTION_ATTR __attribute__((flatten)) -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) #define CEED_QFUNCTION_ATTR _Pragma("forceinline") #else #define CEED_QFUNCTION_ATTR @@ -49,6 +51,33 @@ backends. It also creates a variable `name_loc` populated with the correct sourc CEED_QFUNCTION_ATTR static int name #endif +/** + @ingroup CeedQFunction +This macro populates the correct function for Rust-based User QFunction source for code generation backends or populates default values for CPU backends. It also creates a variable `name_loc` populated with the correct source path for creating the respective User QFunction. Note that the function, as named in rust, must be called `name_rs`. 
When referencing it in C, use just `name` (no `_rs`) +Example: +//ex1-volume.h +CEED_QFUNCTION_RUST(build_mass) +//ex1-volume.c +CeedAddRustSourceRoot(ceed, "examples/ceed/ex1-volume-rs"); +// ex1-volume-rs/src/lib.rs +#[no_mangle] +pub unsafe extern "C" fn build_mass_rs( + ctx: *mut c_void, + Q: i32, + in: *const *const f64, + out: *mut *mut f64, +) -> i8 +**/ +#ifndef CEED_QFUNCTION_RUST +#define CEED_QFUNCTION_RUST(name) \ + CEED_QFUNCTION_ATTR int name##_rs(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); \ + CEED_QFUNCTION_ATTR static int name(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { \ + return name##_rs(ctx, Q, in, out); \ + } \ + static const char name##_loc[] = __FILE__ ":" #name; +#endif +// Note: placing the _loc of the function below the function in the macro is required because python cffi will exclude the previous line (the }) based on the backslash at the end of it, which is required for our python build script to exclude macros. See /python/build_ceed_cffi.py for more details + /** @ingroup CeedQFunction This macro populates the correct function annotations for User QFunction helper function source for code generation backends or populates default @@ -74,7 +103,7 @@ values for CPU backends. Code generation backends may redefine this macro, as needed. **/ #ifndef CeedPragmaSIMD -#if defined(__INTEL_COMPILER) +#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) #define CeedPragmaSIMD _Pragma("vector") /// Cannot use Intel pragma ivdep because it miscompiles unpacking symmetric tensors, as in Poisson2DApply, where the SIMD loop body contains /// temporaries such as the following. diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 5618a087ab..23bf05c419 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -153,29 +153,59 @@ static int CeedGivensRotation(CeedScalar *A, CeedScalar c, CeedScalar s, CeedTra @param[in] m Number of rows in array @param[in] n Number of columns in array @param[in] a Array to be viewed + @param[in] tabs Tabs to append before each new line @param[in] stream Stream to view to, e.g., `stdout` @return An error code: 0 - success, otherwise - failure @ref Developer **/ -static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, FILE *stream) { +static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, const char *tabs, FILE *stream) { if (m > 1) { - fprintf(stream, " %s:\n", name); + fprintf(stream, "%s %s:\n", tabs, name); } else { char padded_name[12]; snprintf(padded_name, 11, "%s:", name); - fprintf(stream, " %-10s", padded_name); + fprintf(stream, "%s %-10s", tabs, padded_name); } for (CeedInt i = 0; i < m; i++) { - if (m > 1) fprintf(stream, " [%" CeedInt_FMT "]", i); + if (m > 1) fprintf(stream, "%s [%" CeedInt_FMT "]", tabs, i); for (CeedInt j = 0; j < n; j++) fprintf(stream, fp_fmt, fabs(a[i * n + j]) > 1E-14 ? 
a[i * n + j] : 0); fputs("\n", stream); } return CEED_ERROR_SUCCESS; } +/** + @brief View a `CeedBasis` passed as a `CeedObject` + + @param[in] basis `CeedBasis` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedBasisView_Object(CeedObject basis, FILE *stream) { + CeedCall(CeedBasisView((CeedBasis)basis, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `CeedBasis` passed as a `CeedObject` + + @param[in,out] basis Address of `CeedBasis` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedBasisDestroy_Object(CeedObject *basis) { + CeedCall(CeedBasisDestroy((CeedBasis *)basis)); + return CEED_ERROR_SUCCESS; +} + /** @brief Create the interpolation and gradient matrices for projection from the nodes of `basis_from` to the nodes of `basis_to`. @@ -194,23 +224,27 @@ static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedI @ref Developer **/ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis basis_to, CeedScalar **interp_project, CeedScalar **grad_project) { - Ceed ceed; - bool is_tensor_to, is_tensor_from; + bool are_both_tensor; CeedInt Q, Q_to, Q_from, P_to, P_from; - CeedCall(CeedBasisGetCeed(basis_to, &ceed)); - // Check for compatible quadrature spaces CeedCall(CeedBasisGetNumQuadraturePoints(basis_to, &Q_to)); CeedCall(CeedBasisGetNumQuadraturePoints(basis_from, &Q_from)); - CeedCheck(Q_to == Q_from, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); + CeedCheck(Q_to == Q_from, CeedBasisReturnCeed(basis_to), CEED_ERROR_DIMENSION, + "Bases must have compatible quadrature spaces." 
+ " 'basis_from' has %" CeedInt_FMT " points and 'basis_to' has %" CeedInt_FMT, + Q_from, Q_to); Q = Q_to; // Check for matching tensor or non-tensor - CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to)); - CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from)); - CeedCheck(is_tensor_to == is_tensor_from, ceed, CEED_ERROR_MINOR, "Bases must both be tensor or non-tensor"); - if (is_tensor_to) { + { + bool is_tensor_to, is_tensor_from; + + CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to)); + CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from)); + are_both_tensor = is_tensor_to && is_tensor_from; + } + if (are_both_tensor) { CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_to)); CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_from)); CeedCall(CeedBasisGetNumQuadraturePoints1D(basis_from, &Q)); @@ -221,17 +255,21 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas // Check for matching FE space CeedFESpace fe_space_to, fe_space_from; + CeedCall(CeedBasisGetFESpace(basis_to, &fe_space_to)); CeedCall(CeedBasisGetFESpace(basis_from, &fe_space_from)); - CeedCheck(fe_space_to == fe_space_from, ceed, CEED_ERROR_MINOR, "Bases must both be the same FE space type"); + CeedCheck(fe_space_to == fe_space_from, CeedBasisReturnCeed(basis_to), CEED_ERROR_MINOR, + "Bases must both be the same FE space type." 
+ " 'basis_from' is a %s and 'basis_to' is a %s", + CeedFESpaces[fe_space_from], CeedFESpaces[fe_space_to]); // Get source matrices CeedInt dim, q_comp = 1; CeedScalar *interp_to_inv, *interp_from; const CeedScalar *interp_to_source = NULL, *interp_from_source = NULL, *grad_from_source = NULL; - CeedCall(CeedBasisGetDimension(basis_to, &dim)); - if (is_tensor_to) { + CeedCall(CeedBasisGetDimension(basis_from, &dim)); + if (are_both_tensor) { CeedCall(CeedBasisGetInterp1D(basis_to, &interp_to_source)); CeedCall(CeedBasisGetInterp1D(basis_from, &interp_from_source)); } else { @@ -246,19 +284,19 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas // projection basis will have a gradient operation (allocated even if not H^1 for the // basis construction later on) if (fe_space_to == CEED_FE_SPACE_H1) { - if (is_tensor_to) { + if (are_both_tensor) { CeedCall(CeedBasisGetGrad1D(basis_from, &grad_from_source)); } else { CeedCall(CeedBasisGetGrad(basis_from, &grad_from_source)); } } - CeedCall(CeedCalloc(P_to * P_from * (is_tensor_to ? 1 : dim), grad_project)); + CeedCall(CeedCalloc(P_to * P_from * (are_both_tensor ? 1 : dim), grad_project)); // Compute interp_to^+, pseudoinverse of interp_to CeedCall(CeedCalloc(Q * q_comp * P_to, &interp_to_inv)); - CeedCall(CeedMatrixPseudoinverse(ceed, interp_to_source, Q * q_comp, P_to, interp_to_inv)); + CeedCall(CeedMatrixPseudoinverse(CeedBasisReturnCeed(basis_to), interp_to_source, Q * q_comp, P_to, interp_to_inv)); // Build matrices - CeedInt num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (is_tensor_to ? 1 : dim); + CeedInt num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (are_both_tensor ? 
1 : dim); CeedScalar *input_from[num_matrices], *output_project[num_matrices]; input_from[0] = (CeedScalar *)interp_from_source; @@ -270,7 +308,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas for (CeedInt m = 0; m < num_matrices; m++) { // output_project = interp_to^+ * interp_from memcpy(interp_from, input_from[m], Q * P_from * q_comp * sizeof(input_from[m][0])); - CeedCall(CeedMatrixMatrixMultiply(ceed, interp_to_inv, input_from[m], output_project[m], P_to, P_from, Q * q_comp)); + CeedCall(CeedMatrixMatrixMultiply(CeedBasisReturnCeed(basis_to), interp_to_inv, input_from[m], output_project[m], P_to, P_from, Q * q_comp)); // Round zero to machine precision for (CeedInt i = 0; i < P_to * P_from; i++) { if (fabs(output_project[m][i]) < 10 * CEED_EPSILON) output_project[m][i] = 0.0; @@ -283,161 +321,625 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas return CEED_ERROR_SUCCESS; } -/// @} - -/// ---------------------------------------------------------------------------- -/// Ceed Backend API -/// ---------------------------------------------------------------------------- -/// @addtogroup CeedBasisBackend -/// @{ - /** - @brief Return collocated gradient matrix + @brief Check input vector dimensions for CeedBasisApply[Add] - @param[in] basis `CeedBasis` - @param[out] collo_grad_1d Row-major (`Q_1d * Q_1d`) matrix expressing derivatives of basis functions at quadrature points + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes + @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, + @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + 
@ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] u Input `CeedVector` + @param[out] v Output `CeedVector` @return An error code: 0 - success, otherwise - failure - @ref Backend + @ref Developer **/ -int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) { - Ceed ceed; - CeedInt P_1d, Q_1d; - CeedScalar *interp_1d_pinv; - const CeedScalar *grad_1d, *interp_1d; - - // Note: This function is for backend use, so all errors are terminal and we do not need to clean up memory on failure. - CeedCall(CeedBasisGetCeed(basis, &ceed)); - CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); - CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); - - // Compute interp_1d^+, pseudoinverse of interp_1d - CeedCall(CeedCalloc(P_1d * Q_1d, &interp_1d_pinv)); - CeedCall(CeedBasisGetInterp1D(basis, &interp_1d)); - CeedCall(CeedMatrixPseudoinverse(ceed, interp_1d, Q_1d, P_1d, interp_1d_pinv)); - CeedCall(CeedBasisGetGrad1D(basis, &grad_1d)); - CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d)); - - CeedCall(CeedFree(&interp_1d_pinv)); - return CEED_ERROR_SUCCESS; -} - -/** - @brief Get tensor status for given `CeedBasis` - - @param[in] basis `CeedBasis` - @param[out] is_tensor Variable to store tensor status +static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; + CeedSize u_length = 0, v_length; - @return An error code: 0 - success, otherwise - failure + CeedCall(CeedBasisGetDimension(basis, &dim)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCall(CeedBasisGetNumNodes(basis, &num_nodes)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + 
CeedCall(CeedVectorGetLength(v, &v_length)); + if (u) CeedCall(CeedVectorGetLength(u, &u_length)); - @ref Backend -**/ -int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) { - *is_tensor = basis->is_tensor_basis; + // Check vector lengths to prevent out of bounds issues + bool has_good_dims = true; + switch (eval_mode) { + case CEED_EVAL_NONE: + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + has_good_dims = ((t_mode == CEED_TRANSPOSE && u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_qpts * (CeedSize)q_comp && + v_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes) || + (t_mode == CEED_NOTRANSPOSE && v_length >= (CeedSize)num_elem * (CeedSize)num_qpts * (CeedSize)num_comp * (CeedSize)q_comp && + u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes)); + break; + case CEED_EVAL_WEIGHT: + has_good_dims = v_length >= (CeedSize)num_elem * (CeedSize)num_qpts; + break; + } + CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); return CEED_ERROR_SUCCESS; } /** - @brief Get backend data of a `CeedBasis` + @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints - @param[in] basis `CeedBasis` - @param[out] data Variable to store data + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] 
x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP @return An error code: 0 - success, otherwise - failure - @ref Backend + @ref Developer **/ -int CeedBasisGetData(CeedBasis basis, void *data) { - *(void **)data = basis->data; - return CEED_ERROR_SUCCESS; -} +static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedInt dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0; + CeedSize x_length = 0, u_length = 0, v_length; -/** - @brief Set backend data of a `CeedBasis` + CeedCall(CeedBasisGetDimension(basis, &dim)); + CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &num_q_comp)); + CeedCall(CeedBasisGetNumNodes(basis, &num_nodes)); + CeedCall(CeedVectorGetLength(v, &v_length)); + if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length)); + if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length)); - @param[in,out] basis `CeedBasis` - @param[in] data Data to set + // Check compatibility of coordinates vector + for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i]; + CeedCheck((x_length >= (CeedSize)total_num_points * (CeedSize)dim) || (eval_mode == CEED_EVAL_WEIGHT), CeedBasisReturnCeed(basis), + CEED_ERROR_DIMENSION, + "Length of reference coordinate vector incompatible with basis dimension and number of points." 
+ " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".", + x_length, (CeedSize)total_num_points * (CeedSize)dim); - @return An error code: 0 - success, otherwise - failure + // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE + CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, + "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE"); - @ref Backend -**/ -int CeedBasisSetData(CeedBasis basis, void *data) { - basis->data = data; + // Check vector lengths to prevent out of bounds issues + bool has_good_dims = true; + switch (eval_mode) { + case CEED_EVAL_INTERP: + has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp || + v_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)) || + (t_mode == CEED_NOTRANSPOSE && (v_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp || + u_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp))); + break; + case CEED_EVAL_GRAD: + has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp * (CeedSize)dim || + v_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)) || + (t_mode == CEED_NOTRANSPOSE && (v_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp * (CeedSize)dim || + u_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp))); + break; + case CEED_EVAL_WEIGHT: + has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points); + break; + // LCOV_EXCL_START + case CEED_EVAL_NONE: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s", + CeedEvalModes[eval_mode]); + // LCOV_EXCL_STOP + } + CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short 
for basis and evaluation mode"); return CEED_ERROR_SUCCESS; } /** - @brief Increment the reference counter for a `CeedBasis` + @brief Default implementation to apply basis evaluation from nodes to arbitrary points - @param[in,out] basis `CeedBasis` to increment the reference counter + @param[in] basis `CeedBasis` to evaluate + @param[in] apply_add Sum result into target vector or overwrite + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP @return An error code: 0 - success, otherwise - failure - @ref Backend + @ref Developer **/ -int CeedBasisReference(CeedBasis basis) { - basis->ref_count++; - return CEED_ERROR_SUCCESS; -} +static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0]; -/** - @brief Get number of Q-vector components for given `CeedBasis` + CeedCall(CeedBasisGetDimension(basis, &dim)); + // Inserting check because clang-tidy doesn't understand this cannot occur + CeedCheck(dim > 0, 
CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required"); + CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); - @param[in] basis `CeedBasis` - @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, - @ref CEED_EVAL_GRAD to use gradients, - @ref CEED_EVAL_DIV to use divergence, - @ref CEED_EVAL_CURL to use curl - @param[out] q_comp Variable to store number of Q-vector components of basis + // Default implementation + { + bool is_tensor_basis; - @return An error code: 0 - success, otherwise - failure + CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); + CeedCheck(is_tensor_basis, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, + "Evaluation at arbitrary points only supported for tensor product bases"); + } + CeedCheck(num_elem == 1, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, + "Evaluation at arbitrary points only supported for a single element at a time"); + if (eval_mode == CEED_EVAL_WEIGHT) { + CeedCall(CeedVectorSetValue(v, 1.0)); + return CEED_ERROR_SUCCESS; + } + if (!basis->basis_chebyshev) { + // Build basis mapping from nodes to Chebyshev coefficients + CeedScalar *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d; + const CeedScalar *q_ref_1d; + Ceed ceed; - @ref Backend -**/ -int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp) { - CeedInt dim; + CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); + CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d)); + CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d)); + CeedCall(CeedBasisGetQRef(basis, &q_ref_1d)); + CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); - CeedCall(CeedBasisGetDimension(basis, &dim)); - switch (eval_mode) { - case CEED_EVAL_INTERP: { - CeedFESpace fe_space; + CeedCall(CeedBasisGetCeed(basis, &ceed)); + 
CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev)); + CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d, + &basis->basis_chebyshev)); - CeedCall(CeedBasisGetFESpace(basis, &fe_space)); - *q_comp = (fe_space == CEED_FE_SPACE_H1) ? 1 : dim; - } break; - case CEED_EVAL_GRAD: - *q_comp = dim; - break; - case CEED_EVAL_DIV: - *q_comp = 1; - break; - case CEED_EVAL_CURL: - *q_comp = (dim < 3) ? 1 : dim; - break; - case CEED_EVAL_NONE: - case CEED_EVAL_WEIGHT: - *q_comp = 1; - break; + // Cleanup + CeedCall(CeedFree(&chebyshev_interp_1d)); + CeedCall(CeedFree(&chebyshev_grad_1d)); + CeedCall(CeedFree(&chebyshev_q_weight_1d)); + CeedCall(CeedDestroy(&ceed)); } - return CEED_ERROR_SUCCESS; -} -/** - @brief Estimate number of FLOPs required to apply `CeedBasis` in `t_mode` and `eval_mode` + // Create TensorContract object if needed, such as a basis from the GPU backends + if (!basis->contract) { + Ceed ceed_ref; + CeedBasis basis_ref = NULL; - @param[in] basis `CeedBasis` to estimate FLOPs for - @param[in] t_mode Apply basis or transpose - @param[in] eval_mode @ref CeedEvalMode - @param[out] flops Address of variable to hold FLOPs estimate + CeedCall(CeedInit("/cpu/self", &ceed_ref)); + // Only need matching tensor contraction dimensions, any type of basis will work + CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref)); + // Note - clang-tidy doesn't know basis_ref->contract must be valid here + CeedCheck(basis_ref && basis_ref->contract, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, + "Reference CPU ceed failed to create a tensor contraction object"); + CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract)); + CeedCall(CeedBasisDestroy(&basis_ref)); + CeedCall(CeedDestroy(&ceed_ref)); + } - @ref Backend -**/ -int CeedBasisGetFlopsEstimate(CeedBasis basis, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops) { - bool is_tensor; + // Basis evaluation + switch (t_mode) { + case CEED_NOTRANSPOSE: { + // Nodes to arbitrary points + CeedScalar *v_array; + const CeedScalar *chebyshev_coeffs, *x_array_read; + + // -- Interpolate to Chebyshev coefficients + CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, basis->vec_chebyshev)); + + // -- Evaluate Chebyshev polynomials at arbitrary points + CeedCall(CeedVectorGetArrayRead(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs)); + CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read)); + CeedCall(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &v_array)); + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; + + // ---- Values at point + for (CeedInt p = 0; p < total_num_points; p++) { + CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1; + + for (CeedInt d = 0; d < dim; d++) { + // ------ Tensor contract with current Chebyshev polynomial values + CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x)); + CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false, + d == 0 ? 
chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2])); + pre /= Q_1d; + post *= 1; + } + for (CeedInt c = 0; c < num_comp; c++) v_array[c * total_num_points + p] = tmp[dim % 2][c]; + } + break; + } + case CEED_EVAL_GRAD: { + CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; + + // ---- Values at point + for (CeedInt p = 0; p < total_num_points; p++) { + // Dim**2 contractions, apply grad when pass == d + for (CeedInt pass = 0; pass < dim; pass++) { + CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1; + + for (CeedInt d = 0; d < dim; d++) { + // ------ Tensor contract with current Chebyshev polynomial values + if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x)); + else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x)); + CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false, + d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2])); + pre /= Q_1d; + post *= 1; + } + for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * total_num_points + p] = tmp[dim % 2][c]; + } + } + break; + } + default: + // Nothing to do, excluded above + break; + } + CeedCall(CeedVectorRestoreArrayRead(basis->vec_chebyshev, &chebyshev_coeffs)); + CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read)); + CeedCall(CeedVectorRestoreArray(v, &v_array)); + break; + } + case CEED_TRANSPOSE: { + // Note: The switch on eval_mode below handles both CEED_EVAL_INTERP and CEED_EVAL_GRAD + // Arbitrary points to nodes + CeedScalar *chebyshev_coeffs; + const CeedScalar *u_array, *x_array_read; + + // -- Transpose of evaluation of Chebyshev polynomials at arbitrary points + CeedCall(CeedVectorGetArrayWrite(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs)); + CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read)); + CeedCall(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array)); 
+ + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; + + // ---- Values at point + for (CeedInt p = 0; p < total_num_points; p++) { + CeedInt pre = num_comp * 1, post = 1; + + for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * total_num_points + p]; + for (CeedInt d = 0; d < dim; d++) { + // ------ Tensor contract with current Chebyshev polynomial values + CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x)); + CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2], + d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2])); + pre /= 1; + post *= Q_1d; + } + } + break; + } + case CEED_EVAL_GRAD: { + CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; + + // ---- Values at point + for (CeedInt p = 0; p < total_num_points; p++) { + // Dim**2 contractions, apply grad when pass == d + for (CeedInt pass = 0; pass < dim; pass++) { + CeedInt pre = num_comp * 1, post = 1; + + for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * total_num_points + p]; + for (CeedInt d = 0; d < dim; d++) { + // ------ Tensor contract with current Chebyshev polynomial values + if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x)); + else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x)); + CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, + (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2], + d == (dim - 1) ? 
chebyshev_coeffs : tmp[(d + 1) % 2])); + pre /= 1; + post *= Q_1d; + } + } + } + break; + } + default: + // Nothing to do, excluded above + break; + } + CeedCall(CeedVectorRestoreArray(basis->vec_chebyshev, &chebyshev_coeffs)); + CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read)); + CeedCall(CeedVectorRestoreArrayRead(u, &u_array)); + + // -- Interpolate transpose from Chebyshev coefficients + if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); + else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); + break; + } + } + return CEED_ERROR_SUCCESS; +} + +/// @} + +/// ---------------------------------------------------------------------------- +/// Ceed Backend API +/// ---------------------------------------------------------------------------- +/// @addtogroup CeedBasisBackend +/// @{ + +/** + @brief Fallback to a reference implementation for a non tensor-product basis for \f$H^1\f$ discretizations. + This function may only be called inside of a backend `BasisCreateH1` function. + This is used by a backend when the specific parameters for a `CeedBasis` exceed the backend's support, such as + when the `interp` and `grad` matrices require too many bytes to fit into shared memory on a GPU. + + @param[in] ceed `Ceed` object used to create the `CeedBasis` + @param[in] topo Topology of element, e.g. 
hypercube, simplex, etc + @param[in] num_comp Number of field components (1 for scalar fields) + @param[in] num_nodes Total number of nodes + @param[in] num_qpts Total number of quadrature points + @param[in] interp Row-major (`num_qpts * num_nodes`) matrix expressing the values of nodal basis functions at quadrature points + @param[in] grad Row-major (`dim * num_qpts * num_nodes`) matrix expressing derivatives of nodal basis functions at quadrature points + @param[in] q_ref Array of length `num_qpts * dim` holding the locations of quadrature points on the reference element + @param[in] q_weight Array of length `num_qpts` holding the quadrature weights on the reference element + @param[out] basis Newly created `CeedBasis` + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { + CeedInt P = num_nodes, Q = num_qpts, dim = 0; + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1"); + + CeedCall(CeedReferenceCopy(delegate, &(basis)->obj.ceed)); + CeedCall(CeedBasisGetTopologyDimension(topo, &dim)); + CeedCall(delegate->BasisCreateH1(topo, dim, P, Q, interp, grad, q_ref, q_weight, basis)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Return collocated gradient matrix + + @param[in] basis `CeedBasis` + @param[out] collo_grad_1d Row-major (`Q_1d * Q_1d`) matrix expressing derivatives of basis functions at quadrature points + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) { + Ceed ceed; + CeedInt P_1d, Q_1d; + CeedScalar *interp_1d_pinv; + const 
CeedScalar *grad_1d, *interp_1d; + + // Note: This function is for backend use, so all errors are terminal and we do not need to clean up memory on failure. + CeedCall(CeedBasisGetCeed(basis, &ceed)); + CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + + // Compute interp_1d^+, pseudoinverse of interp_1d + CeedCall(CeedCalloc(P_1d * Q_1d, &interp_1d_pinv)); + CeedCall(CeedBasisGetInterp1D(basis, &interp_1d)); + CeedCall(CeedMatrixPseudoinverse(ceed, interp_1d, Q_1d, P_1d, interp_1d_pinv)); + CeedCall(CeedBasisGetGrad1D(basis, &grad_1d)); + CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d)); + + CeedCall(CeedFree(&interp_1d_pinv)); + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space + + @param[in] basis `CeedBasis` + @param[out] chebyshev_interp_1d Row-major (`P_1d * Q_1d`) matrix interpolating from basis nodes to Chebyshev polynomial coefficients + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d) { + CeedInt P_1d, Q_1d; + CeedScalar *C, *chebyshev_coeffs_1d_inv; + const CeedScalar *interp_1d, *q_ref_1d; + Ceed ceed; + + CeedCall(CeedBasisGetCeed(basis, &ceed)); + CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + + // Build coefficient matrix + // -- Note: Clang-tidy needs this check + CeedCheck((P_1d > 0) && (Q_1d > 0), ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed"); + CeedCall(CeedCalloc(Q_1d * Q_1d, &C)); + CeedCall(CeedBasisGetQRef(basis, &q_ref_1d)); + for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d])); + + // Compute C^+, pseudoinverse of coefficient matrix + 
CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv)); + CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv)); + + // Build mapping from nodes to Chebyshev coefficients + CeedCall(CeedBasisGetInterp1D(basis, &interp_1d)); + CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d)); + + // Cleanup + CeedCall(CeedFree(&C)); + CeedCall(CeedFree(&chebyshev_coeffs_1d_inv)); + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get tensor status for given `CeedBasis` + + @param[in] basis `CeedBasis` + @param[out] is_tensor Variable to store tensor status + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) { + *is_tensor = basis->is_tensor_basis; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Determine if given `CeedBasis` has nodes collocated with quadrature points + + @param[in] basis `CeedBasis` + @param[out] is_collocated Variable to store collocated status + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated) { + if (basis->is_tensor_basis && (basis->Q_1d == basis->P_1d)) { + *is_collocated = true; + + for (CeedInt i = 0; i < basis->P_1d; i++) { + *is_collocated = *is_collocated && (fabs(basis->interp_1d[i + basis->P_1d * i] - 1.0) < 10 * CEED_EPSILON); + for (CeedInt j = 0; j < basis->Q_1d; j++) { + if (j != i) *is_collocated = *is_collocated && (fabs(basis->interp_1d[j + basis->P_1d * i]) < 10 * CEED_EPSILON); + } + } + } else { + *is_collocated = false; + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get backend data of a `CeedBasis` + + @param[in] basis `CeedBasis` + @param[out] data Variable to store data + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisGetData(CeedBasis basis, void *data) { + *(void **)data = 
basis->data; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set backend data of a `CeedBasis` + + @param[in,out] basis `CeedBasis` + @param[in] data Data to set + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisSetData(CeedBasis basis, void *data) { + basis->data = data; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Increment the reference counter for a `CeedBasis` + + @param[in,out] basis `CeedBasis` to increment the reference counter + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisReference(CeedBasis basis) { + CeedCall(CeedObjectReference((CeedObject)basis)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get number of Q-vector components for given `CeedBasis` + + @param[in] basis `CeedBasis` + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl + @param[out] q_comp Variable to store number of Q-vector components of basis + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp) { + CeedInt dim; + + CeedCall(CeedBasisGetDimension(basis, &dim)); + switch (eval_mode) { + case CEED_EVAL_INTERP: { + CeedFESpace fe_space; + + CeedCall(CeedBasisGetFESpace(basis, &fe_space)); + *q_comp = (fe_space == CEED_FE_SPACE_H1) ? 1 : dim; + } break; + case CEED_EVAL_GRAD: + *q_comp = dim; + break; + case CEED_EVAL_DIV: + *q_comp = 1; + break; + case CEED_EVAL_CURL: + *q_comp = (dim < 3) ? 
1 : dim; + break; + case CEED_EVAL_NONE: + case CEED_EVAL_WEIGHT: + *q_comp = 1; + break; + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Estimate number of FLOPs required to apply `CeedBasis` in `t_mode` and `eval_mode` + + @param[in] basis `CeedBasis` to estimate FLOPs for + @param[in] t_mode Apply basis or transpose + @param[in] eval_mode @ref CeedEvalMode + @param[in] is_at_points Evaluate the basis at points or quadrature points + @param[in] num_points Number of points basis is evaluated at + @param[out] flops Address of variable to hold FLOPs estimate + + @ref Backend +**/ +int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points, + CeedSize *flops) { + bool is_tensor; CeedCall(CeedBasisIsTensor(basis, &is_tensor)); + CeedCheck(!is_at_points || is_tensor, CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Can only evaluate tensor-product bases at points"); if (is_tensor) { CeedInt dim, num_comp, P_1d, Q_1d; @@ -450,32 +952,92 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva Q_1d = P_1d; } CeedInt tensor_flops = 0, pre = num_comp * CeedIntPow(P_1d, dim - 1), post = 1; + for (CeedInt d = 0; d < dim; d++) { tensor_flops += 2 * pre * P_1d * post * Q_1d; pre /= P_1d; post *= Q_1d; } - switch (eval_mode) { - case CEED_EVAL_NONE: - *flops = 0; - break; - case CEED_EVAL_INTERP: - *flops = tensor_flops; - break; - case CEED_EVAL_GRAD: - *flops = tensor_flops * 2; - break; - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - // LCOV_EXCL_START - return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported", - CeedEvalModes[eval_mode]); - break; - // LCOV_EXCL_STOP + if (is_at_points) { + bool is_gpu = false; + + { + CeedMemType mem_type; + + CeedCall(CeedGetPreferredMemType(CeedBasisReturnCeed(basis), &mem_type)); + is_gpu = mem_type == CEED_MEM_DEVICE; + } + + CeedInt chebyshev_flops = (Q_1d - 2) * 3 
+ 1, d_chebyshev_flops = (Q_1d - 2) * 8 + 1; + CeedInt point_tensor_flops = 0, pre = CeedIntPow(Q_1d, dim - 1), post = 1; + + for (CeedInt d = 0; d < dim; d++) { + point_tensor_flops += 2 * pre * Q_1d * post * 1; + pre /= P_1d; + post *= Q_1d; + } + + switch (eval_mode) { + case CEED_EVAL_NONE: + *flops = 0; + break; + case CEED_EVAL_INTERP: { + *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)); + if (dim == 3 && is_gpu) { + *flops += num_points * Q_1d * + (chebyshev_flops + num_comp * (2 * chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 * Q_1d + 1 : 3 * Q_1d))); + } else { + *flops += num_points * (is_gpu ? num_comp : 1) * dim * chebyshev_flops; + } + break; + } + case CEED_EVAL_GRAD: { + *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)); + if (dim == 3 && is_gpu) { + CeedInt inner_flops = + dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d) + (dim - 1) * (2 * chebyshev_flops + d_chebyshev_flops); + + *flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops + num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0))); + } else { + *flops += num_points * (is_gpu ? 
num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops); + } + break; + } + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + // LCOV_EXCL_START + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported at points", + CeedEvalModes[eval_mode]); + break; + // LCOV_EXCL_STOP + } + case CEED_EVAL_WEIGHT: + *flops = num_points; + break; + } + } else { + switch (eval_mode) { + case CEED_EVAL_NONE: + *flops = 0; + break; + case CEED_EVAL_INTERP: + *flops = tensor_flops; + break; + case CEED_EVAL_GRAD: + *flops = tensor_flops * 2; + break; + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + // LCOV_EXCL_START + return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported", + CeedEvalModes[eval_mode]); + break; + // LCOV_EXCL_STOP + } + case CEED_EVAL_WEIGHT: + *flops = dim * CeedIntPow(Q_1d, dim); + break; } - case CEED_EVAL_WEIGHT: - *flops = dim * CeedIntPow(Q_1d, dim); - break; } } else { CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; @@ -977,8 +1539,9 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_ Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateTensorH1"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateTensorH1"); CeedCall(CeedBasisCreateTensorH1(delegate, dim, num_comp, P_1d, Q_1d, interp_1d, grad_1d, q_ref_1d, q_weight_1d, basis)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -990,8 +1553,7 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_ CeedElemTopology topo = dim == 1 ? CEED_TOPOLOGY_LINE : dim == 2 ? 
CEED_TOPOLOGY_QUAD : CEED_TOPOLOGY_HEX; CeedCall(CeedCalloc(1, basis)); - CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed)); - (*basis)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj)); (*basis)->is_tensor_basis = true; (*basis)->dim = dim; (*basis)->topo = topo; @@ -1094,13 +1656,13 @@ int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, Ce @brief Create a non tensor-product basis for \f$H^1\f$ discretizations @param[in] ceed `Ceed` object used to create the `CeedBasis` - @param[in] topo Topology of element, e.g. hypercube, simplex, ect + @param[in] topo Topology of element, e.g. hypercube, simplex, etc @param[in] num_comp Number of field components (1 for scalar fields) @param[in] num_nodes Total number of nodes @param[in] num_qpts Total number of quadrature points @param[in] interp Row-major (`num_qpts * num_nodes`) matrix expressing the values of nodal basis functions at quadrature points @param[in] grad Row-major (`dim * num_qpts * num_nodes`) matrix expressing derivatives of nodal basis functions at quadrature points - @param[in] q_ref Array of length `num_qpts` * dim holding the locations of quadrature points on the reference element + @param[in] q_ref Array of length `num_qpts * dim` holding the locations of quadrature points on the reference element @param[in] q_weight Array of length `num_qpts` holding the quadrature weights on the reference element @param[out] basis Address of the variable where the newly created `CeedBasis` will be stored @@ -1116,8 +1678,9 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateH1"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1"); CeedCall(CeedBasisCreateH1(delegate, topo, num_comp, num_nodes, 
num_qpts, interp, grad, q_ref, q_weight, basis)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -1128,8 +1691,7 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn CeedCall(CeedBasisGetTopologyDimension(topo, &dim)); CeedCall(CeedCalloc(1, basis)); - CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed)); - (*basis)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj)); (*basis)->is_tensor_basis = false; (*basis)->dim = dim; (*basis)->topo = topo; @@ -1177,6 +1739,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHdiv"); CeedCall(CeedBasisCreateHdiv(delegate, topo, num_comp, num_nodes, num_qpts, interp, div, q_ref, q_weight, basis)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -1187,8 +1750,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed CeedCall(CeedBasisGetTopologyDimension(topo, &dim)); CeedCall(CeedCalloc(1, basis)); - CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed)); - (*basis)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj)); (*basis)->is_tensor_basis = false; (*basis)->dim = dim; (*basis)->topo = topo; @@ -1236,6 +1798,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHcurl"); CeedCall(CeedBasisCreateHcurl(delegate, topo, num_comp, num_nodes, num_qpts, interp, curl, q_ref, q_weight, basis)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -1247,8 +1810,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee 
curl_comp = (dim < 3) ? 1 : dim; CeedCall(CeedCalloc(1, basis)); - CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed)); - (*basis)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj)); (*basis)->is_tensor_basis = false; (*basis)->dim = dim; (*basis)->topo = topo; @@ -1281,6 +1843,8 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee Note: `basis_project` will have the same number of components as `basis_from`, regardless of the number of components that `basis_to` has. If `basis_from` has 3 components and `basis_to` has 5 components, then `basis_project` will have 3 components. + Note: If either `basis_from` or `basis_to` are non-tensor, then `basis_project` will also be non-tensor + @param[in] basis_from `CeedBasis` to prolong from @param[in] basis_to `CeedBasis` to prolong to @param[out] basis_project Address of the variable where the newly created `CeedBasis` will be stored @@ -1291,9 +1855,9 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee **/ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project) { Ceed ceed; - bool is_tensor; + bool create_tensor; CeedInt dim, num_comp; - CeedScalar *q_ref, *q_weight, *interp_project, *grad_project; + CeedScalar *interp_project, *grad_project; CeedCall(CeedBasisGetCeed(basis_to, &ceed)); @@ -1301,35 +1865,36 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi CeedCall(CeedBasisCreateProjectionMatrices(basis_from, basis_to, &interp_project, &grad_project)); // Build basis - CeedCall(CeedBasisIsTensor(basis_to, &is_tensor)); + { + bool is_tensor_to, is_tensor_from; + + CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to)); + CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from)); + create_tensor = is_tensor_from && is_tensor_to; + } CeedCall(CeedBasisGetDimension(basis_to, &dim)); CeedCall(CeedBasisGetNumComponents(basis_from, 
&num_comp)); - if (is_tensor) { + if (create_tensor) { CeedInt P_1d_to, P_1d_from; CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_1d_from)); CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_1d_to)); - CeedCall(CeedCalloc(P_1d_to, &q_ref)); - CeedCall(CeedCalloc(P_1d_to, &q_weight)); - CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, q_ref, q_weight, basis_project)); + CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, NULL, NULL, basis_project)); } else { // Even if basis_to and basis_from are not H1, the resulting basis is H1 for interpolation to work CeedInt num_nodes_to, num_nodes_from; CeedElemTopology topo; - CeedCall(CeedBasisGetTopology(basis_to, &topo)); + CeedCall(CeedBasisGetTopology(basis_from, &topo)); CeedCall(CeedBasisGetNumNodes(basis_from, &num_nodes_from)); CeedCall(CeedBasisGetNumNodes(basis_to, &num_nodes_to)); - CeedCall(CeedCalloc(num_nodes_to * dim, &q_ref)); - CeedCall(CeedCalloc(num_nodes_to, &q_weight)); - CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, q_ref, q_weight, basis_project)); + CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, NULL, NULL, basis_project)); } // Cleanup CeedCall(CeedFree(&interp_project)); CeedCall(CeedFree(&grad_project)); - CeedCall(CeedFree(&q_ref)); - CeedCall(CeedFree(&q_weight)); + CeedCall(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -1353,6 +1918,36 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) { return CEED_ERROR_SUCCESS; } +/** + @brief Set the number of tabs to indent for @ref CeedBasisView() output + + @param[in] basis `CeedBasis` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs) { + 
CeedCall(CeedObjectSetNumViewTabs((CeedObject)basis, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedBasisView() output + + @param[in] basis `CeedBasis` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs) { + CeedCall(CeedObjectGetNumViewTabs((CeedObject)basis, num_tabs)); + return CEED_ERROR_SUCCESS; +} + /** @brief View a `CeedBasis` @@ -1365,6 +1960,7 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) { **/ int CeedBasisView(CeedBasis basis, FILE *stream) { bool is_tensor_basis; + char *tabs = NULL; CeedElemTopology topo; CeedFESpace fe_space; @@ -1373,14 +1969,22 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { CeedCall(CeedBasisGetTopology(basis, &topo)); CeedCall(CeedBasisGetFESpace(basis, &fe_space)); + { + CeedInt num_tabs = 0; + + CeedCall(CeedBasisGetNumViewTabs(basis, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; + } + // Print FE space and element topology of the basis - fprintf(stream, "CeedBasis in a %s on a %s element\n", CeedFESpaces[fe_space], CeedElemTopologies[topo]); + fprintf(stream, "%sCeedBasis in a %s on a %s element\n", tabs, CeedFESpaces[fe_space], CeedElemTopologies[topo]); if (is_tensor_basis) { - fprintf(stream, " P: %" CeedInt_FMT "\n Q: %" CeedInt_FMT "\n", basis->P_1d, basis->Q_1d); + fprintf(stream, "%s P: %" CeedInt_FMT "\n%s Q: %" CeedInt_FMT "\n", tabs, basis->P_1d, tabs, basis->Q_1d); } else { - fprintf(stream, " P: %" CeedInt_FMT "\n Q: %" CeedInt_FMT "\n", basis->P, basis->Q); + fprintf(stream, "%s P: %" CeedInt_FMT "\n%s Q: %" CeedInt_FMT "\n", tabs, basis->P, tabs, basis->Q); } - fprintf(stream, " dimension: %" CeedInt_FMT "\n field components: %" CeedInt_FMT "\n", basis->dim, 
basis->num_comp); + fprintf(stream, "%s dimension: %" CeedInt_FMT "\n%s field components: %" CeedInt_FMT "\n", tabs, basis->dim, tabs, basis->num_comp); // Print quadrature data, interpolation/gradient/divergence/curl of the basis if (is_tensor_basis) { // tensor basis CeedInt P_1d, Q_1d; @@ -1393,10 +1997,10 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { CeedCall(CeedBasisGetInterp1D(basis, &interp_1d)); CeedCall(CeedBasisGetGrad1D(basis, &grad_1d)); - CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, Q_1d, q_ref_1d, stream)); - CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, Q_1d, q_weight_1d, stream)); - CeedCall(CeedScalarView("interp1d", "\t% 12.8f", Q_1d, P_1d, interp_1d, stream)); - CeedCall(CeedScalarView("grad1d", "\t% 12.8f", Q_1d, P_1d, grad_1d, stream)); + CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, Q_1d, q_ref_1d, tabs, stream)); + CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, Q_1d, q_weight_1d, tabs, stream)); + CeedCall(CeedScalarView("interp1d", "\t% 12.8f", Q_1d, P_1d, interp_1d, tabs, stream)); + CeedCall(CeedScalarView("grad1d", "\t% 12.8f", Q_1d, P_1d, grad_1d, tabs, stream)); } else { // non-tensor basis CeedInt P, Q, dim, q_comp; const CeedScalar *q_ref, *q_weight, *interp, *grad, *div, *curl; @@ -1411,23 +2015,24 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { CeedCall(CeedBasisGetDiv(basis, &div)); CeedCall(CeedBasisGetCurl(basis, &curl)); - CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, Q * dim, q_ref, stream)); - CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, Q, q_weight, stream)); + CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, Q * dim, q_ref, tabs, stream)); + CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, Q, q_weight, tabs, stream)); CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp)); - CeedCall(CeedScalarView("interp", "\t% 12.8f", q_comp * Q, P, interp, stream)); + CeedCall(CeedScalarView("interp", "\t% 12.8f", q_comp * Q, P, interp, tabs, stream)); if (grad) { 
CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp)); - CeedCall(CeedScalarView("grad", "\t% 12.8f", q_comp * Q, P, grad, stream)); + CeedCall(CeedScalarView("grad", "\t% 12.8f", q_comp * Q, P, grad, tabs, stream)); } if (div) { CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp)); - CeedCall(CeedScalarView("div", "\t% 12.8f", q_comp * Q, P, div, stream)); + CeedCall(CeedScalarView("div", "\t% 12.8f", q_comp * Q, P, div, tabs, stream)); } if (curl) { CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp)); - CeedCall(CeedScalarView("curl", "\t% 12.8f", q_comp * Q, P, curl, stream)); + CeedCall(CeedScalarView("curl", "\t% 12.8f", q_comp * Q, P, curl, tabs, stream)); } } + CeedCall(CeedFree(&tabs)); return CEED_ERROR_SUCCESS; } @@ -1453,45 +2058,38 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { @ref User **/ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; - CeedSize u_length = 0, v_length; - Ceed ceed; - - CeedCall(CeedBasisGetCeed(basis, &ceed)); - CeedCall(CeedBasisGetDimension(basis, &dim)); - CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); - CeedCall(CeedBasisGetNumNodes(basis, &num_nodes)); - CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); - CeedCall(CeedVectorGetLength(v, &v_length)); - if (u) CeedCall(CeedVectorGetLength(u, &u_length)); + CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v)); + CeedCheck(basis->Apply, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply"); + CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} - CeedCheck(basis->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply"); +/** + @brief Apply 
basis evaluation from quadrature points to nodes and sum into target vector - // Check compatibility of topological and geometrical dimensions - CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0 && u_length % num_qpts == 0) || - (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0 && v_length % num_qpts == 0), - ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions"); + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] t_mode @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes; + @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAdd()` + @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, + @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] u Input `CeedVector` + @param[out] v Output `CeedVector` to sum into - // Check vector lengths to prevent out of bounds issues - bool has_good_dims = true; - switch (eval_mode) { - case CEED_EVAL_NONE: - case CEED_EVAL_INTERP: - case CEED_EVAL_GRAD: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - has_good_dims = - ((t_mode == CEED_TRANSPOSE && u_length >= num_elem * num_comp * num_qpts * q_comp && v_length >= num_elem * num_comp * num_nodes) || - (t_mode == CEED_NOTRANSPOSE && v_length >= num_elem * num_qpts * num_comp * q_comp && u_length >= num_elem * num_comp * num_nodes)); - break; - case CEED_EVAL_WEIGHT: - has_good_dims = v_length >= num_elem * num_qpts; - break; - } - CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); + @return An error code: 0 - success, otherwise - failure - CeedCall(basis->Apply(basis, num_elem, 
t_mode, eval_mode, u, v)); + @ref User +**/ +int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAdd only supports CEED_TRANSPOSE"); + CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v)); + CeedCheck(basis->ApplyAdd, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedBasisApplyAdd"); + CeedCall(basis->ApplyAdd(basis, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ -1499,7 +2097,9 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, @brief Apply basis evaluation from nodes to arbitrary points @param[in] basis `CeedBasis` to evaluate - @param[in] num_points The number of points to apply the basis evaluation to + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, @@ -1513,259 +2113,45 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, @ref User **/ -int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, - CeedVector v) { - bool is_tensor_basis; - CeedInt dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1; - CeedSize x_length = 0, u_length = 0, v_length; - Ceed ceed; - - CeedCall(CeedBasisGetCeed(basis, &ceed)); - CeedCall(CeedBasisGetDimension(basis, &dim)); - CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); - 
CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); - CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &num_q_comp)); - CeedCall(CeedBasisGetNumNodes(basis, &num_nodes)); - CeedCall(CeedVectorGetLength(v, &v_length)); - if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length)); - if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length)); - - // Check compatibility of topological and geometrical dimensions - CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) || - (eval_mode == CEED_EVAL_WEIGHT), - ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points"); - - // Check compatibility coordinates vector - CeedCheck((x_length >= num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION, - "Length of reference coordinate vector incompatible with basis dimension and number of points"); - - // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE - CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED, - "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE"); - - // Check vector lengths to prevent out of bounds issues - bool has_good_dims = true; - switch (eval_mode) { - case CEED_EVAL_INTERP: - has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= num_points * num_q_comp || v_length >= num_nodes * num_comp)) || - (t_mode == CEED_NOTRANSPOSE && (v_length >= num_points * num_q_comp || u_length >= num_nodes * num_comp))); - break; - case CEED_EVAL_GRAD: - has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= num_points * num_q_comp * dim || v_length >= num_nodes * num_comp)) || - (t_mode == CEED_NOTRANSPOSE && (v_length >= num_points * num_q_comp * dim || u_length >= num_nodes * num_comp))); - break; - case CEED_EVAL_WEIGHT: - has_good_dims = t_mode == 
CEED_NOTRANSPOSE && (v_length >= num_points); - break; - // LCOV_EXCL_START - case CEED_EVAL_NONE: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s", CeedEvalModes[eval_mode]); - // LCOV_EXCL_STOP - } - CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); - - // Backend method +int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); if (basis->ApplyAtPoints) { - CeedCall(basis->ApplyAtPoints(basis, num_points, t_mode, eval_mode, x_ref, u, v)); - return CEED_ERROR_SUCCESS; - } - - // Default implementation - CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); - CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases"); - if (eval_mode == CEED_EVAL_WEIGHT) { - CeedCall(CeedVectorSetValue(v, 1.0)); - return CEED_ERROR_SUCCESS; - } - if (!basis->basis_chebyshev) { - // Build matrix mapping from quadrature point values to Chebyshev coefficients - CeedScalar *C, *chebyshev_coeffs_1d_inv; - const CeedScalar *q_ref_1d; - - // Build coefficient matrix - // -- Note: Clang-tidy needs this check because it does not understand the is_tensor_basis check above - CeedCheck(P_1d > 0 && Q_1d > 0, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed"); - CeedCall(CeedCalloc(Q_1d * Q_1d, &C)); - CeedCall(CeedBasisGetQRef(basis, &q_ref_1d)); - for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d])); - - // Compute C^+, pseudoinverse of coefficient matrix - CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv)); - 
CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv)); - - // Build basis mapping from nodes to Chebyshev coefficients - CeedScalar *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d; - const CeedScalar *interp_1d; - - CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); - CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d)); - CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d)); - CeedCall(CeedBasisGetInterp1D(basis, &interp_1d)); - CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d)); - - CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev)); - CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d, - &basis->basis_chebyshev)); - - // Cleanup - CeedCall(CeedFree(&C)); - CeedCall(CeedFree(&chebyshev_coeffs_1d_inv)); - CeedCall(CeedFree(&chebyshev_interp_1d)); - CeedCall(CeedFree(&chebyshev_grad_1d)); - CeedCall(CeedFree(&chebyshev_q_weight_1d)); - } - - // Create TensorContract object if needed, such as a basis from the GPU backends - if (!basis->contract) { - Ceed ceed_ref; - CeedBasis basis_ref = NULL; - - CeedCall(CeedInit("/cpu/self", &ceed_ref)); - // Only need matching tensor contraction dimensions, any type of basis will work - CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref)); - // Note - clang-tidy doesn't know basis_ref->contract must be valid here - CeedCheck(basis_ref && basis_ref->contract, ceed, CEED_ERROR_UNSUPPORTED, "Reference CPU ceed failed to create a tensor contraction object"); - CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract)); - CeedCall(CeedBasisDestroy(&basis_ref)); - CeedCall(CeedDestroy(&ceed_ref)); + CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } else { + 
CeedCall(CeedBasisApplyAtPoints_Core(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); } + return CEED_ERROR_SUCCESS; +} - // Basis evaluation - switch (t_mode) { - case CEED_NOTRANSPOSE: { - // Nodes to arbitrary points - CeedScalar *v_array; - const CeedScalar *chebyshev_coeffs, *x_array_read; - - // -- Interpolate to Chebyshev coefficients - CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, basis->vec_chebyshev)); - - // -- Evaluate Chebyshev polynomials at arbitrary points - CeedCall(CeedVectorGetArrayRead(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs)); - CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read)); - CeedCall(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &v_array)); - switch (eval_mode) { - case CEED_EVAL_INTERP: { - CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; - - // ---- Values at point - for (CeedInt p = 0; p < num_points; p++) { - CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1; - - for (CeedInt d = 0; d < dim; d++) { - // ------ Tensor contract with current Chebyshev polynomial values - CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x)); - CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false, - d == 0 ? 
chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2])); - pre /= Q_1d; - post *= 1; - } - for (CeedInt c = 0; c < num_comp; c++) v_array[c * num_points + p] = tmp[dim % 2][c]; - } - break; - } - case CEED_EVAL_GRAD: { - CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; - - // ---- Values at point - for (CeedInt p = 0; p < num_points; p++) { - // Dim**2 contractions, apply grad when pass == dim - for (CeedInt pass = 0; pass < dim; pass++) { - CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1; - - for (CeedInt d = 0; d < dim; d++) { - // ------ Tensor contract with current Chebyshev polynomial values - if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x)); - else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x)); - CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false, - d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2])); - pre /= Q_1d; - post *= 1; - } - for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * num_points + p] = tmp[dim % 2][c]; - } - } - break; - } - default: - // Nothing to do, excluded above - break; - } - CeedCall(CeedVectorRestoreArrayRead(basis->vec_chebyshev, &chebyshev_coeffs)); - CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read)); - CeedCall(CeedVectorRestoreArray(v, &v_array)); - break; - } - case CEED_TRANSPOSE: { - // Note: No switch on e_mode here because only CEED_EVAL_INTERP is supported at this time - // Arbitrary points to nodes - CeedScalar *chebyshev_coeffs; - const CeedScalar *u_array, *x_array_read; - - // -- Transpose of evaluation of Chebyshev polynomials at arbitrary points - CeedCall(CeedVectorGetArrayWrite(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs)); - CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read)); - CeedCall(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array)); - - switch (eval_mode) { - 
case CEED_EVAL_INTERP: { - CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; - - // ---- Values at point - for (CeedInt p = 0; p < num_points; p++) { - CeedInt pre = num_comp * 1, post = 1; - - for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * num_points + p]; - for (CeedInt d = 0; d < dim; d++) { - // ------ Tensor contract with current Chebyshev polynomial values - CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x)); - CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2], - d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2])); - pre /= 1; - post *= Q_1d; - } - } - break; - } - case CEED_EVAL_GRAD: { - CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d]; +/** + @brief Apply basis evaluation from arbitrary points to nodes and sum into target vector - // ---- Values at point - for (CeedInt p = 0; p < num_points; p++) { - // Dim**2 contractions, apply grad when pass == dim - for (CeedInt pass = 0; pass < dim; pass++) { - CeedInt pre = num_comp * 1, post = 1; + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes; + @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAddAtPoints()` + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length
`num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP - for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * num_points + p]; - for (CeedInt d = 0; d < dim; d++) { - // ------ Tensor contract with current Chebyshev polynomial values - if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x)); - else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x)); - CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, - (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2], - d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2])); - pre /= 1; - post *= Q_1d; - } - } - } - break; - } - default: - // Nothing to do, excluded above - break; - } - CeedCall(CeedVectorRestoreArray(basis->vec_chebyshev, &chebyshev_coeffs)); - CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read)); - CeedCall(CeedVectorRestoreArrayRead(u, &u_array)); + @return An error code: 0 - success, otherwise - failure - // -- Interpolate transpose from Chebyshev coefficients - CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); - break; - } + @ref User +**/ +int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAddAtPoints only supports CEED_TRANSPOSE"); + CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + if (basis->ApplyAddAtPoints) { + CeedCall(basis->ApplyAddAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } else { + CeedCall(CeedBasisApplyAtPoints_Core(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); } return 
CEED_ERROR_SUCCESS; } @@ -1781,20 +2167,20 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod @ref Advanced **/ int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed) { - *ceed = CeedBasisReturnCeed(basis); + CeedCall(CeedObjectGetCeed((CeedObject)basis, ceed)); return CEED_ERROR_SUCCESS; } /** @brief Return the `Ceed` associated with a `CeedBasis` - @param[in] basis `CeedBasis` + @param[in] basis `CeedBasis` @return `Ceed` associated with the `basis` @ref Advanced **/ -Ceed CeedBasisReturnCeed(CeedBasis basis) { return basis->ceed; } +Ceed CeedBasisReturnCeed(CeedBasis basis) { return CeedObjectReturnCeed((CeedObject)basis); } /** @brief Get dimension for given `CeedBasis` @@ -2073,7 +2459,7 @@ int CeedBasisGetCurl(CeedBasis basis, const CeedScalar **curl) { } /** - @brief Destroy a @ref CeedBasis + @brief Destroy a @ref CeedBasis @param[in,out] basis `CeedBasis` to destroy @@ -2082,7 +2468,7 @@ int CeedBasisGetCurl(CeedBasis basis, const CeedScalar **curl) { @ref User **/ int CeedBasisDestroy(CeedBasis *basis) { - if (!*basis || *basis == CEED_BASIS_NONE || --(*basis)->ref_count > 0) { + if (!*basis || *basis == CEED_BASIS_NONE || CeedObjectDereference((CeedObject)*basis) > 0) { *basis = NULL; return CEED_ERROR_SUCCESS; } @@ -2098,7 +2484,7 @@ int CeedBasisDestroy(CeedBasis *basis) { CeedCall(CeedFree(&(*basis)->curl)); CeedCall(CeedVectorDestroy(&(*basis)->vec_chebyshev)); CeedCall(CeedBasisDestroy(&(*basis)->basis_chebyshev)); - CeedCall(CeedDestroy(&(*basis)->ceed)); + CeedCall(CeedObjectDestroy_Private(&(*basis)->obj)); CeedCall(CeedFree(basis)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-config.c b/interface/ceed-config.c new file mode 100644 index 0000000000..37ae708ec7 --- /dev/null +++ b/interface/ceed-config.c @@ -0,0 +1,90 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +const char *CeedGitVersion = CEED_GIT_VERSION; +const char *CeedBuildConfiguration = CEED_BUILD_CONFIGURATION; + +/// @addtogroup CeedUser +/// @{ + +/** + @brief Get output of `git describe --dirty` from build time. + + While @ref CeedGetVersion() uniquely identifies the source code for release + builds, it does not identify builds from other commits. + + @param[out] git_version A static string containing the Git commit description. + + If `git describe --always --dirty` fails, the string `"unknown"` will be provided. + This could occur if Git is not installed or if libCEED is not being built from a repository, for example. + + @ref Developer + + @sa CeedGetVersion() CeedGetBuildConfiguration() + + @return An error code: 0 - success, otherwise - failure +*/ +int CeedGetGitVersion(const char **git_version) { + *git_version = CeedGitVersion; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set whether or not to use Clang when compiling for GPU (instead of nvrtc) + + @param[in,out] ceed `Ceed` context to set Clang GPU compilation flag + @param[in] is_clang Flag to use clang for GPU compilation + + @ref Developer + + @sa CeedGetIsClang() + + @return An error code: 0 - success, otherwise - failure + */ +int CeedSetIsClang(Ceed ceed, bool is_clang) { + ceed->cuda_compile_with_clang = is_clang; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Determine if the current `ceed` is set to compile with Clang for GPU + + @param[in] ceed `Ceed` context to get Clang GPU compilation flag + @param[out] is_clang Variable to store Clang GPU compilation flag + + @ref Developer + + @sa CeedSetIsClang() + + @return An error code: 0 - success, otherwise - failure + */ +int CeedGetIsClang(Ceed ceed, bool *is_clang) { + *is_clang = ceed->cuda_compile_with_clang; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get build variables as a multi-line string.
+ + Each line of the string has the format `VARNAME = value`. + + @param[out] build_config A static string containing build variables + + @ref Developer + + @sa CeedGetVersion() CeedGetGitVersion() + + @return An error code: 0 - success, otherwise - failure +*/ +int CeedGetBuildConfiguration(const char **build_config) { + *build_config = CeedBuildConfiguration; + return CEED_ERROR_SUCCESS; +} + +/// @} diff --git a/interface/ceed-cuda.c b/interface/ceed-cuda.c index c4463b738d..ea15d46735 100644 --- a/interface/ceed-cuda.c +++ b/interface/ceed-cuda.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -23,10 +23,7 @@ **/ int CeedQFunctionSetCUDAUserFunction(CeedQFunction qf, CUfunction f) { if (!qf->SetCUDAUserFunction) { - Ceed ceed; - - CeedCall(CeedQFunctionGetCeed(qf, &ceed)); - CeedDebug(ceed, "Backend does not support CUfunction pointers for QFunctions."); + CeedDebug(CeedQFunctionReturnCeed(qf), "Backend does not support CUfunction pointers for QFunctions."); } else { CeedCall(qf->SetCUDAUserFunction(qf, f)); } diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c index e687c0daed..476daab0c2 100644 --- a/interface/ceed-elemrestriction.c +++ b/interface/ceed-elemrestriction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -98,6 +98,35 @@ int CeedPermutePadCurlOrients(const CeedInt8 *curl_orients, CeedInt8 *block_curl return CEED_ERROR_SUCCESS; } +/** + @brief View a `CeedElemRestriction` passed as a `CeedObject` + + @param[in] rstr `CeedElemRestriction` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedElemRestrictionView_Object(CeedObject rstr, FILE *stream) { + CeedCall(CeedElemRestrictionView((CeedElemRestriction)rstr, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `CeedElemRestriction` passed as a `CeedObject` + + @param[in,out] rstr Address of `CeedElemRestriction` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedElemRestrictionDestroy_Object(CeedObject *rstr) { + CeedCall(CeedElemRestrictionDestroy((CeedElemRestriction *)rstr)); + return CEED_ERROR_SUCCESS; +} + /// @} /// ---------------------------------------------------------------------------- @@ -146,7 +175,7 @@ int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided) { @ref Backend **/ -int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points) { +int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points) { *is_points = (rstr->rstr_type == CEED_RESTRICTION_POINTS); return CEED_ERROR_SUCCESS; } @@ -164,13 +193,12 @@ int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points) { **/ int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible) { CeedInt num_elem_a, num_elem_b, num_points_a, num_points_b; - Ceed ceed; - - CeedCall(CeedElemRestrictionGetCeed(rstr_a, &ceed)); // Cannot compare non-points restrictions - CeedCheck(rstr_a->rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_UNSUPPORTED, "First CeedElemRestriction must be AtPoints"); -
CeedCheck(rstr_b->rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_UNSUPPORTED, "Second CeedElemRestriction must be AtPoints"); + CeedCheck(rstr_a->rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr_a), CEED_ERROR_UNSUPPORTED, + "First CeedElemRestriction must be AtPoints"); + CeedCheck(rstr_b->rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr_a), CEED_ERROR_UNSUPPORTED, + "Second CeedElemRestriction must be AtPoints"); CeedCall(CeedElemRestrictionGetNumElements(rstr_a, &num_elem_a)); CeedCall(CeedElemRestrictionGetNumElements(rstr_b, &num_elem_b)); @@ -243,7 +271,7 @@ int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, CeedMemType mem_type CeedCall(CeedElemRestrictionGetOffsets(rstr->rstr_base, mem_type, offsets)); } else { CeedCheck(rstr->GetOffsets, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED, - "Backend does not support CeedElemRestrictionGetOffsets"); + "Backend does not implement CeedElemRestrictionGetOffsets"); CeedCall(rstr->GetOffsets(rstr, mem_type, offsets)); rstr->num_readers++; } @@ -284,7 +312,7 @@ int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, const CeedInt ** **/ int CeedElemRestrictionGetOrientations(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { CeedCheck(rstr->GetOrientations, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED, - "Backend does not support CeedElemRestrictionGetOrientations"); + "Backend does not implement CeedElemRestrictionGetOrientations"); CeedCall(rstr->GetOrientations(rstr, mem_type, orients)); rstr->num_readers++; return CEED_ERROR_SUCCESS; @@ -320,7 +348,7 @@ int CeedElemRestrictionRestoreOrientations(CeedElemRestriction rstr, const bool **/ int CeedElemRestrictionGetCurlOrientations(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { CeedCheck(rstr->GetCurlOrientations, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED, - "Backend does not support 
CeedElemRestrictionGetCurlOrientations"); + "Backend does not implement CeedElemRestrictionGetCurlOrientations"); CeedCall(rstr->GetCurlOrientations(rstr, mem_type, curl_orients)); rstr->num_readers++; return CEED_ERROR_SUCCESS; @@ -357,14 +385,13 @@ int CeedElemRestrictionRestoreCurlOrientations(CeedElemRestriction rstr, const C int CeedElemRestrictionGetLLayout(CeedElemRestriction rstr, CeedInt layout[3]) { bool has_backend_strides; CeedRestrictionType rstr_type; - Ceed ceed; - CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); - CeedCheck(rstr_type == CEED_RESTRICTION_STRIDED, ceed, CEED_ERROR_MINOR, "Only strided CeedElemRestriction have strided L-vector layout"); + CeedCheck(rstr_type == CEED_RESTRICTION_STRIDED, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR, + "Only strided CeedElemRestriction have strided L-vector layout"); CeedCall(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); if (has_backend_strides) { - CeedCheck(rstr->l_layout[0], ceed, CEED_ERROR_MINOR, "CeedElemRestriction has no L-vector layout data"); + CeedCheck(rstr->l_layout[0], CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR, "CeedElemRestriction has no L-vector layout data"); for (CeedInt i = 0; i < 3; i++) layout[i] = rstr->l_layout[i]; } else { CeedCall(CeedElemRestrictionGetStrides(rstr, layout)); @@ -429,6 +456,70 @@ int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]) { return CEED_ERROR_SUCCESS; } +/** + + @brief Get the E-vector element offset of a `CeedElemRestriction` at points + + @param[in] rstr `CeedElemRestriction` + @param[in] elem Element number index into E-vector for + @param[out] elem_offset Offset for element `elem` in the E-vector. + The data for point `i`, component `j`, element `elem` in the E-vector is given by `i*e_layout[0] + j*e_layout[1] + elem_offset`. 
+ + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) { + CeedInt num_comp; + CeedRestrictionType rstr_type; + + CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); + CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE, + "Can only compute offset for a points CeedElemRestriction"); + + // Backend method + if (rstr->GetAtPointsElementOffset) { + CeedCall(rstr->GetAtPointsElementOffset(rstr, elem, elem_offset)); + return CEED_ERROR_SUCCESS; + } + + // Default layout (CPU) + *elem_offset = 0; + CeedCall(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + for (CeedInt i = 0; i < elem; i++) { + CeedInt num_points; + + CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, i, &num_points)); + *elem_offset += (CeedSize)num_points * (CeedSize)num_comp; + } + return CEED_ERROR_SUCCESS; +} + +/** + + @brief Set the E-vector size of a `CeedElemRestriction` at points + + @param[in,out] rstr `CeedElemRestriction` + @param[in] e_size New E-vector size; must be at least the current E-vector size + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size) { + CeedRestrictionType rstr_type; + + CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); + CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE, + "Can only set the E-vector size for a points CeedElemRestriction"); + CeedCheck(e_size >= rstr->e_size, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE, + "Can only increase the size of the E-vector for the CeedElemRestriction."
+ " Current size: %" CeedSize_FMT " New size: %" CeedSize_FMT, + rstr->e_size, e_size); + rstr->e_size = e_size; + return CEED_ERROR_SUCCESS; +} + /** @brief Get the backend data of a `CeedElemRestriction` @@ -469,7 +560,7 @@ int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data) { @ref Backend **/ int CeedElemRestrictionReference(CeedElemRestriction rstr) { - rstr->ref_count++; + CeedCall(CeedObjectReference((CeedObject)rstr)); return CEED_ERROR_SUCCESS; } @@ -571,6 +662,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreate"); CeedCall(CeedElemRestrictionCreate(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -580,8 +672,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1"); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -626,8 +717,9 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_ CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateOriented"); - CeedCall( - CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients, rstr)); + 
CeedCall(CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients, + rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -637,8 +729,7 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_ CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1"); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -686,6 +777,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateCurlOriented"); CeedCall(CeedElemRestrictionCreateCurlOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, curl_orients, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -695,8 +787,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1"); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -738,18 +829,19 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED,
"Backend does not implement CeedElemRestrictionCreateStrided"); CeedCall(CeedElemRestrictionCreateStrided(delegate, num_elem, elem_size, num_comp, l_size, strides, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } CeedCheck(num_elem >= 0, ceed, CEED_ERROR_DIMENSION, "Number of elements must be non-negative"); CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component"); - CeedCheck(l_size >= (CeedSize)num_elem * elem_size * num_comp, ceed, CEED_ERROR_DIMENSION, - "L-vector size must be at least num_elem * elem_size * num_comp"); + CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION, + "L-vector size must be at least num_elem * elem_size * num_comp. Expected: >= %" CeedSize_FMT " Found: %" CeedSize_FMT, + (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -804,25 +896,28 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateAtPoints"); CeedCall(CeedElemRestrictionCreateAtPoints(delegate, num_elem, num_points, num_comp, l_size, mem_type, copy_mode, offsets, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } CeedCheck(num_elem >= 0, ceed, CEED_ERROR_DIMENSION, "Number of elements must be non-negative"); CeedCheck(num_points >= 0, ceed, CEED_ERROR_DIMENSION, "Number of points must be non-negative");
CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component"); - CeedCheck(l_size >= (CeedSize)num_points * num_comp, ceed, CEED_ERROR_DIMENSION, "L-vector must be at least num_points * num_comp"); + CeedCheck(l_size >= (CeedSize)num_points * num_comp, ceed, CEED_ERROR_DIMENSION, + "L-vector must be at least num_points * num_comp. Expected: >= %" CeedSize_FMT " Found: %" CeedSize_FMT, (CeedSize)num_points * num_comp, + l_size); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; - (*rstr)->num_elem = num_elem; - (*rstr)->num_points = num_points; - (*rstr)->num_comp = num_comp; - (*rstr)->l_size = l_size; - (*rstr)->e_size = (CeedSize)num_points * (CeedSize)num_comp; - (*rstr)->num_block = num_elem; - (*rstr)->block_size = 1; - (*rstr)->rstr_type = CEED_RESTRICTION_POINTS; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); + (*rstr)->num_elem = num_elem; + (*rstr)->num_points = num_points; + (*rstr)->num_comp = num_comp; + (*rstr)->comp_stride = 1; + (*rstr)->l_size = l_size; + (*rstr)->e_size = (CeedSize)num_points * (CeedSize)num_comp; + (*rstr)->num_block = num_elem; + (*rstr)->block_size = 1; + (*rstr)->rstr_type = CEED_RESTRICTION_POINTS; CeedCall(ceed->ElemRestrictionCreateAtPoints(mem_type, copy_mode, offsets, NULL, NULL, *rstr)); return CEED_ERROR_SUCCESS; } @@ -864,6 +959,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlocked"); CeedCall(CeedElemRestrictionCreateBlocked(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -877,8 +973,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s
CeedCall(CeedPermutePadOffsets(offsets, block_offsets, num_block, num_elem, block_size, elem_size)); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -933,6 +1028,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedOriented"); CeedCall(CeedElemRestrictionCreateBlockedOriented(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -947,8 +1043,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn CeedCall(CeedPermutePadOrients(orients, block_orients, num_block, num_elem, block_size, elem_size)); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -958,8 +1053,8 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn (*rstr)->num_block = num_block; (*rstr)->block_size = block_size; (*rstr)->rstr_type = CEED_RESTRICTION_ORIENTED; - CeedCall( - ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)block_offsets, (const bool *)block_orients, NULL, *rstr)); + CeedCall(ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)block_offsets, (const bool *)block_orients, NULL, + *rstr)); if (copy_mode == CEED_OWN_POINTER) CeedCall(CeedFree(&offsets)); 
return CEED_ERROR_SUCCESS; } @@ -1005,6 +1100,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedCurlOriented"); CeedCall(CeedElemRestrictionCreateBlockedCurlOriented(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, curl_orients, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -1020,8 +1116,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce CeedCall(CeedPermutePadCurlOrients(curl_orients, block_curl_orients, num_block, num_elem, block_size, 3 * elem_size)); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -1066,6 +1161,7 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedStrided"); CeedCall(CeedElemRestrictionCreateBlockedStrided(delegate, num_elem, elem_size, block_size, num_comp, l_size, strides, rstr)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -1073,12 +1169,12 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); CeedCheck(block_size > 0, ceed, CEED_ERROR_DIMENSION, "Block size must be at least 1"); CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component"); - CeedCheck(l_size >= (CeedSize)num_elem * elem_size * num_comp, ceed, 
CEED_ERROR_DIMENSION, - "L-vector size must be at least num_elem * elem_size * num_comp"); + CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION, + "L-vector size must be at least num_elem * elem_size * num_comp. Expected: >= %" CeedSize_FMT " Found: %" CeedSize_FMT, + (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size); CeedCall(CeedCalloc(1, rstr)); - CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed)); - (*rstr)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj)); (*rstr)->num_elem = num_elem; (*rstr)->elem_size = elem_size; (*rstr)->num_comp = num_comp; @@ -1110,10 +1206,9 @@ int CeedElemRestrictionCreateUnsignedCopy(CeedElemRestriction rstr, CeedElemRest // Copy old rstr memcpy(*rstr_unsigned, rstr, sizeof(struct CeedElemRestriction_private)); - (*rstr_unsigned)->ceed = NULL; - CeedCall(CeedReferenceCopy(rstr->ceed, &(*rstr_unsigned)->ceed)); - (*rstr_unsigned)->ref_count = 1; - (*rstr_unsigned)->strides = NULL; + CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, + &(*rstr_unsigned)->obj)); + (*rstr_unsigned)->strides = NULL; if (rstr->strides) { CeedCall(CeedMalloc(3, &(*rstr_unsigned)->strides)); for (CeedInt i = 0; i < 3; i++) (*rstr_unsigned)->strides[i] = rstr->strides[i]; @@ -1142,10 +1237,9 @@ int CeedElemRestrictionCreateUnorientedCopy(CeedElemRestriction rstr, CeedElemRe // Copy old rstr memcpy(*rstr_unoriented, rstr, sizeof(struct CeedElemRestriction_private)); - (*rstr_unoriented)->ceed = NULL; - CeedCall(CeedReferenceCopy(rstr->ceed, &(*rstr_unoriented)->ceed)); - (*rstr_unoriented)->ref_count = 1; - (*rstr_unoriented)->strides = NULL; + CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, + &(*rstr_unoriented)->obj)); +
(*rstr_unoriented)->strides = NULL; if (rstr->strides) { CeedCall(CeedMalloc(3, &(*rstr_unoriented)->strides)); for (CeedInt i = 0; i < 3; i++) (*rstr_unoriented)->strides[i] = rstr->strides[i]; @@ -1199,6 +1293,7 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec, CeedCall(CeedElemRestrictionGetEVectorSize(rstr, &e_size)); if (l_vec) CeedCall(CeedVectorCreate(ceed, l_size, l_vec)); if (e_vec) CeedCall(CeedVectorCreate(ceed, e_size, e_vec)); + CeedCall(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -1219,9 +1314,7 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec, int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, CeedRequest *request) { CeedSize min_u_len, min_ru_len, len; CeedInt num_elem; - Ceed ceed; - CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed)); if (t_mode == CEED_NOTRANSPOSE) { CeedCall(CeedElemRestrictionGetEVectorSize(rstr, &min_ru_len)); CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_u_len)); @@ -1230,11 +1323,11 @@ int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_ru_len)); } CeedCall(CeedVectorGetLength(u, &len)); - CeedCheck(min_u_len <= len, ceed, CEED_ERROR_DIMENSION, + CeedCheck(min_u_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_ru_len, min_u_len); CeedCall(CeedVectorGetLength(ru, &len)); - CeedCheck(min_ru_len <= len, ceed, CEED_ERROR_DIMENSION, + CeedCheck(min_ru_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_u_len, min_ru_len); CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); @@ -1261,9 +1354,10 @@ int 
CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt CeedRequest *request) { CeedSize min_u_len, min_ru_len, len; CeedInt num_elem; - Ceed ceed; - CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCheck(rstr->ApplyAtPointsInElement, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED, + "Backend does not implement CeedElemRestrictionApplyAtPointsInElement"); + if (t_mode == CEED_NOTRANSPOSE) { CeedInt num_points, num_comp; @@ -1280,17 +1374,17 @@ int CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_ru_len)); } CeedCall(CeedVectorGetLength(u, &len)); - CeedCheck(min_u_len <= len, ceed, CEED_ERROR_DIMENSION, + CeedCheck(min_u_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ") for element %" CeedInt_FMT, len, min_ru_len, min_u_len, elem); CeedCall(CeedVectorGetLength(ru, &len)); - CeedCheck(min_ru_len <= len, ceed, CEED_ERROR_DIMENSION, + CeedCheck(min_ru_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ") for element %" CeedInt_FMT, len, min_ru_len, min_u_len, elem); CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCheck(elem < num_elem, ceed, CEED_ERROR_DIMENSION, + CeedCheck(elem < num_elem, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Cannot retrieve element %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", elem, elem, num_elem); if (num_elem > 0) CeedCall(rstr->ApplyAtPointsInElement(rstr, elem, t_mode, u, ru, request)); return CEED_ERROR_SUCCESS; @@ -1315,10 +1409,9 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT CeedRequest *request) { CeedSize min_u_len, min_ru_len, len; CeedInt 
block_size, num_elem; - Ceed ceed; - CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCheck(rstr->ApplyBlock, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionApplyBlock"); + CeedCheck(rstr->ApplyBlock, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED, + "Backend does not implement CeedElemRestrictionApplyBlock"); CeedCall(CeedElemRestrictionGetBlockSize(rstr, &block_size)); if (t_mode == CEED_NOTRANSPOSE) { @@ -1337,15 +1430,15 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT min_u_len = (CeedSize)block_size * (CeedSize)elem_size * (CeedSize)num_comp; } CeedCall(CeedVectorGetLength(u, &len)); - CeedCheck(min_u_len == len, ceed, CEED_ERROR_DIMENSION, + CeedCheck(min_u_len == len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_u_len, min_ru_len); CeedCall(CeedVectorGetLength(ru, &len)); - CeedCheck(min_ru_len == len, ceed, CEED_ERROR_DIMENSION, + CeedCheck(min_ru_len == len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_ru_len, min_u_len); CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCheck(block_size * block <= num_elem, ceed, CEED_ERROR_DIMENSION, + CeedCheck(block_size * block <= num_elem, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION, "Cannot retrieve block %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", block, block_size * block, num_elem); CeedCall(rstr->ApplyBlock(rstr, block, t_mode, u, ru, request)); @@ -1363,7 +1456,7 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT @ref Advanced **/ int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) { - *ceed = CeedElemRestrictionReturnCeed(rstr); + 
CeedCall(CeedObjectGetCeed((CeedObject)rstr, ceed)); return CEED_ERROR_SUCCESS; } @@ -1376,7 +1469,7 @@ int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) { @ref Advanced **/ -Ceed CeedElemRestrictionReturnCeed(CeedElemRestriction rstr) { return rstr->ceed; } +Ceed CeedElemRestrictionReturnCeed(CeedElemRestriction rstr) { return CeedObjectReturnCeed((CeedObject)rstr); } /** @brief Get the L-vector component stride @@ -1425,10 +1518,10 @@ int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, CeedInt *elem_si /** - @brief Get the number of points in the l-vector for a points `CeedElemRestriction` + @brief Get the number of points in the offsets array for a points `CeedElemRestriction` @param[in] rstr `CeedElemRestriction` - @param[out] num_points The number of points in the l-vector + @param[out] num_points The number of points in the offsets array @return An error code: 0 - success, otherwise - failure @@ -1472,34 +1565,77 @@ int CeedElemRestrictionGetNumPointsInElement(CeedElemRestriction rstr, CeedInt e } /** - @brief Get the maximum number of points in an element for a `CeedElemRestriction` at points + @brief Get the minimum and/or maximum number of points in an element for a `CeedElemRestriction` at points @param[in] rstr `CeedElemRestriction` - @param[out] max_points Variable to store size of elements + @param[out] min_points Variable to minimum number of points in an element, or `NULL` + @param[out] max_points Variable to maximum number of points in an element, or `NULL` @return An error code: 0 - success, otherwise - failure @ref Advanced **/ -int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points) { - CeedInt num_elem; +int CeedElemRestrictionGetMinMaxPointsInElement(CeedElemRestriction rstr, CeedInt *min_points, CeedInt *max_points) { + CeedInt num_elem, num_points; CeedRestrictionType rstr_type; CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); CeedCheck(rstr_type == 
CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE, - "Cannot compute max points for a CeedElemRestriction that does not use points"); + "Cannot compute min/max points for a CeedElemRestriction that does not use points"); CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - *max_points = 0; - for (CeedInt e = 0; e < num_elem; e++) { - CeedInt num_points; + // Exit early if there are no elements + if (num_elem == 0) { + if (min_points) *min_points = 0; + if (max_points) *max_points = 0; + return CEED_ERROR_SUCCESS; + } + + // Initialize to the number of points in the first element + CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, 0, &num_points)); + if (min_points) *min_points = num_points; + if (max_points) *max_points = num_points; + for (CeedInt e = 1; e < num_elem; e++) { CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points)); - *max_points = CeedIntMax(num_points, *max_points); + if (min_points) *min_points = CeedIntMin(num_points, *min_points); + if (max_points) *max_points = CeedIntMax(num_points, *max_points); } return CEED_ERROR_SUCCESS; } +/** + @brief Get the maximum number of points in an element for a `CeedElemRestriction` at points + + @param[in] rstr `CeedElemRestriction` + @param[out] max_points Variable to store maximum number of points in an element + + @return An error code: 0 - success, otherwise - failure + + @ref User + + @see CeedElemRestrictionGetMinMaxPointsInElement() +**/ +int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points) { + return CeedElemRestrictionGetMinMaxPointsInElement(rstr, NULL, max_points); +} + +/** + @brief Get the minimum number of points in an element for a `CeedElemRestriction` at points + + @param[in] rstr `CeedElemRestriction` + @param[out] min_points Variable to store minimum number of points in an element + + @return An error code: 0 - success, otherwise - failure + + @ref User + + @see 
CeedElemRestrictionGetMinMaxPointsInElement() +**/ +int CeedElemRestrictionGetMinPointsInElement(CeedElemRestriction rstr, CeedInt *min_points) { + return CeedElemRestrictionGetMinMaxPointsInElement(rstr, min_points, NULL); +} + /** @brief Get the size of the l-vector for a `CeedElemRestriction` @@ -1602,6 +1738,36 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult return CEED_ERROR_SUCCESS; } +/** + @brief Set the number of tabs to indent for @ref CeedElemRestrictionView() output + + @param[in] rstr `CeedElemRestriction` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs) { + CeedCall(CeedObjectSetNumViewTabs((CeedObject)rstr, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedElemRestrictionView() output + + @param[in] rstr `CeedElemRestriction` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs) { + CeedCall(CeedObjectGetNumViewTabs((CeedObject)rstr, num_tabs)); + return CEED_ERROR_SUCCESS; +} + /** @brief View a `CeedElemRestriction` @@ -1613,17 +1779,26 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult @ref User **/ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) { + char *tabs = NULL; CeedRestrictionType rstr_type; + { + CeedInt num_tabs = 0; + + CeedCall(CeedElemRestrictionGetNumViewTabs(rstr, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; + } + CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); if (rstr_type == CEED_RESTRICTION_POINTS) { CeedInt 
max_points; CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr, &max_points)); fprintf(stream, - "CeedElemRestriction at points from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with a maximum of %" CeedInt_FMT + "%sCeedElemRestriction at points from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with a maximum of %" CeedInt_FMT " points on an element\n", - rstr->l_size, rstr->num_comp, rstr->num_elem, max_points); + tabs, rstr->l_size, rstr->num_comp, rstr->num_elem, max_points); } else { char strides_str[500]; @@ -1633,11 +1808,12 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) { sprintf(strides_str, "%" CeedInt_FMT, rstr->comp_stride); } fprintf(stream, - "%sCeedElemRestriction from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT + "%s%sCeedElemRestriction from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT " nodes each and %s %s\n", - rstr->block_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size, + tabs, rstr->block_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size, rstr->strides ? 
"strides" : "component stride", strides_str); } + CeedCall(CeedFree(&tabs)); return CEED_ERROR_SUCCESS; } @@ -1651,11 +1827,11 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) { @ref User **/ int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) { - if (!*rstr || *rstr == CEED_ELEMRESTRICTION_NONE || --(*rstr)->ref_count > 0) { + if (!*rstr || *rstr == CEED_ELEMRESTRICTION_NONE || CeedObjectDereference((CeedObject)*rstr) > 0) { *rstr = NULL; return CEED_ERROR_SUCCESS; } - CeedCheck((*rstr)->num_readers == 0, (*rstr)->ceed, CEED_ERROR_ACCESS, + CeedCheck((*rstr)->num_readers == 0, CeedElemRestrictionReturnCeed(*rstr), CEED_ERROR_ACCESS, "Cannot destroy CeedElemRestriction, a process has read access to the offset data"); // Only destroy backend data once between rstr and unsigned copy @@ -1663,7 +1839,7 @@ int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) { else if ((*rstr)->Destroy) CeedCall((*rstr)->Destroy(*rstr)); CeedCall(CeedFree(&(*rstr)->strides)); - CeedCall(CeedDestroy(&(*rstr)->ceed)); + CeedCall(CeedObjectDestroy_Private(&(*rstr)->obj)); CeedCall(CeedFree(rstr)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c index d3ed061c3e..042d7ae014 100644 --- a/interface/ceed-fortran.c +++ b/interface/ceed-fortran.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -85,6 +85,9 @@ CEED_EXTERN void fCeedIsDeterministic(int *ceed, int *is_deterministic, int *err #define fCeedGetPreferredMemType FORTRAN_NAME(ceedgetpreferredmemtype, CEEDGETPREFERREDMEMTYPE) CEED_EXTERN void fCeedGetPreferredMemType(int *ceed, int *type, int *err) { *err = CeedGetPreferredMemType(Ceed_dict[*ceed], (CeedMemType *)type); } +#define fCeedSetNumViewTabs FORTRAN_NAME(ceedsetnumviewtabs, CEEDSETNUMVIEWTABS) +CEED_EXTERN void fCeedSetNumViewTabs(int *ceed, int *num_tabs, int *err) { *err = CeedSetNumViewTabs(Ceed_dict[*ceed], *num_tabs); } + #define fCeedView FORTRAN_NAME(ceedview, CEEDVIEW) CEED_EXTERN void fCeedView(int *ceed, int *err) { *err = CeedView(Ceed_dict[*ceed], stdout); } @@ -192,6 +195,9 @@ CEED_EXTERN void fCeedVectorNorm(int *vec, int *type, CeedScalar *norm, int *err #define fCeedVectorReciprocal FORTRAN_NAME(ceedvectorreciprocal, CEEDVECTORRECIPROCAL) CEED_EXTERN void fCeedVectorReciprocal(int *vec, int *err) { *err = CeedVectorReciprocal(CeedVector_dict[*vec]); } +#define fCeedVectorSetNumViewTabs FORTRAN_NAME(ceedvectorsetnumviewtabs, CEEDVECTORSETNUMVIEWTABS) +CEED_EXTERN void fCeedVectorSetNumViewTabs(int *vec, int *num_tabs, int *err) { *err = CeedVectorSetNumViewTabs(CeedVector_dict[*vec], *num_tabs); } + #define fCeedVectorView FORTRAN_NAME(ceedvectorview, CEEDVECTORVIEW) CEED_EXTERN void fCeedVectorView(int *vec, int *err) { *err = CeedVectorView(CeedVector_dict[*vec], "%12.8f", stdout); } @@ -449,6 +455,11 @@ CEED_EXTERN void fCeedElemRestrictionGetELayout(int *elemr, int *layout, int *er for (int i = 0; i < 3; i++) layout[i] = layout_c[i]; } +#define fCeedElemRestrictionSetNumViewTabs FORTRAN_NAME(ceedelemrestrictionsetnumviewtabs, CEEDELEMRESTRICTIONSETNUMVIEWTABS) +CEED_EXTERN void fCeedElemRestrictionSetNumViewTabs(int *elemr, int *num_tabs, int *err) { + *err = CeedElemRestrictionSetNumViewTabs(CeedElemRestriction_dict[*elemr], *num_tabs); +} + #define fCeedElemRestrictionView 
FORTRAN_NAME(ceedelemrestrictionview, CEEDELEMRESTRICTIONVIEW) CEED_EXTERN void fCeedElemRestrictionView(int *elemr, int *err) { *err = CeedElemRestrictionView(CeedElemRestriction_dict[*elemr], stdout); } @@ -575,6 +586,9 @@ CEED_EXTERN void fCeedBasisCreateHcurl(int *ceed, int *topo, int *num_comp, int } } +#define fCeedBasisSetNumViewTabs FORTRAN_NAME(ceedbasissetnumviewtabs, CEEDBASISSETNUMVIEWTABS) +CEED_EXTERN void fCeedBasisSetNumViewTabs(int *basis, int *num_tabs, int *err) { *err = CeedBasisSetNumViewTabs(CeedBasis_dict[*basis], *num_tabs); } + #define fCeedBasisView FORTRAN_NAME(ceedbasisview, CEEDBASISVIEW) CEED_EXTERN void fCeedBasisView(int *basis, int *err) { *err = CeedBasisView(CeedBasis_dict[*basis], stdout); } @@ -688,6 +702,11 @@ CEED_EXTERN void fCeedQFunctionContextRestoreData(int *ctx, CeedScalar *data, in *offset = 0; } +#define fCeedQFunctionContextSetNumViewTabs FORTRAN_NAME(ceedqfunctioncontextsetnumviewtabs, CEEDQFUNCTIONCONTEXTSETNUMVIEWTABS) +CEED_EXTERN void fCeedQFunctionContextSetNumViewTabs(int *ctx, int *num_tabs, int *err) { + *err = CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext_dict[*ctx], *num_tabs); +} + #define fCeedQFunctionContextView FORTRAN_NAME(ceedqfunctioncontextview, CEEDQFUNCTIONCONTEXTVIEW) CEED_EXTERN void fCeedQFunctionContextView(int *ctx, int *err) { *err = CeedQFunctionContextView(CeedQFunctionContext_dict[*ctx], stdout); } @@ -845,6 +864,13 @@ CEED_EXTERN void fCeedQFunctionSetContext(int *qf, int *ctx, int *err) { if (*err) return; fctxdata->inner_ctx = ctx_; *err = CeedQFunctionContextRestoreData(fctx, (void **)&fctxdata); + if (*err) return; + *err = CeedQFunctionContextDestroy(&fctx); +} + +#define fCeedQFunctionSetNumViewTabs FORTRAN_NAME(ceedqfunctionsetnumviewtabs, CEEDQFUNCTIONSETNUMVIEWTABS) +CEED_EXTERN void fCeedQFunctionSetNumViewTabs(int *qf, int *num_tabs, int *err) { + *err = CeedQFunctionSetNumViewTabs(CeedQFunction_dict[*qf], *num_tabs); } #define fCeedQFunctionView 
FORTRAN_NAME(ceedqfunctionview, CEEDQFUNCTIONVIEW) @@ -949,8 +975,8 @@ CEED_EXTERN void fCeedOperatorCreate(int *ceed, int *qf, int *dqf, int *dqfT, in CeedOperator_n++; } -#define fCeedCompositeOperatorCreate FORTRAN_NAME(ceedcompositeoperatorcreate, CEEDCOMPOSITEOPERATORCREATE) -CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) { +#define fCeedOperatorCreateComposite FORTRAN_NAME(ceedoperatorcreatecomposite, CEEDOPERATORCREATECOMPOSITE) +CEED_EXTERN void fCeedOperatorCreateComposite(int *ceed, int *op, int *err) { if (CeedOperator_count == CeedOperator_count_max) { CeedOperator_count_max += CeedOperator_count_max / 2 + 1; CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); @@ -958,7 +984,7 @@ CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) { CeedOperator *op_ = &CeedOperator_dict[CeedOperator_count]; - *err = CeedCompositeOperatorCreate(Ceed_dict[*ceed], op_); + *err = CeedOperatorCreateComposite(Ceed_dict[*ceed], op_); if (*err) return; *op = CeedOperator_count++; CeedOperator_n++; @@ -1001,12 +1027,12 @@ CEED_EXTERN void fCeedOperatorSetField(int *op, const char *field_name, int *r, *err = CeedOperatorSetField(op_, field_name_c, r_, b_, v_); } -#define fCeedCompositeOperatorAddSub FORTRAN_NAME(ceedcompositeoperatoraddsub, CEEDCOMPOSITEOPERATORADDSUB) -CEED_EXTERN void fCeedCompositeOperatorAddSub(int *compositeop, int *subop, int *err) { +#define fCeedOperatorCompositeAddSub FORTRAN_NAME(ceedoperatorcompositeaddsub, CEEDOPERATORCOMPOSITEADDSUB) +CEED_EXTERN void fCeedOperatorCompositeAddSub(int *compositeop, int *subop, int *err) { CeedOperator compositeop_ = CeedOperator_dict[*compositeop]; CeedOperator subop_ = CeedOperator_dict[*subop]; - *err = CeedCompositeOperatorAddSub(compositeop_, subop_); + *err = CeedOperatorCompositeAddSub(compositeop_, subop_); } #define fCeedOperatorSetName FORTRAN_NAME(ceedoperatorsetname, CEEDOPERATORSETNAME) @@ -1017,6 +1043,13 @@ CEED_EXTERN void 
fCeedOperatorSetName(int *op, const char *name, int *err, fortr *err = CeedOperatorSetName(op_, name_c); } +#define fCeedOperatorSetNumViewTabs FORTRAN_NAME(ceedoperatorsetnumviewtabs, CEEDOPERATORSETNUMVIEWTABS) +CEED_EXTERN void fCeedOperatorSetNumViewTabs(int *op, int *ntabs, int *err) { + CeedOperator op_ = CeedOperator_dict[*op]; + + *err = CeedOperatorSetNumViewTabs(op_, *ntabs); +} + #define fCeedOperatorLinearAssembleQFunction FORTRAN_NAME(ceedoperatorlinearassembleqfunction, CEEDOPERATORLINEARASSEMBLEQFUNCTION) CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, int *assembledvec, int *assembledrstr, int *rqst, int *err) { // Vector diff --git a/interface/ceed-hip.c b/interface/ceed-hip.c index cc4a625853..f35480e873 100644 --- a/interface/ceed-hip.c +++ b/interface/ceed-hip.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -23,10 +23,7 @@ **/ int CeedQFunctionSetHIPUserFunction(CeedQFunction qf, hipFunction_t f) { if (!qf->SetHIPUserFunction) { - Ceed ceed; - - CeedCall(CeedQFunctionGetCeed(qf, &ceed)); - CeedDebug(ceed, "Backend does not support hipFunction_t pointers for QFunctions."); + CeedDebug(CeedQFunctionReturnCeed(qf), "Backend does not support hipFunction_t pointers for QFunctions."); } else { CeedCall(qf->SetHIPUserFunction(qf, f)); } diff --git a/interface/ceed-jit-source-root-default.c b/interface/ceed-jit-source-root-default.c index 6f1bc47e6c..2cee49718f 100644 --- a/interface/ceed-jit-source-root-default.c +++ b/interface/ceed-jit-source-root-default.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/interface/ceed-jit-source-root-install.c b/interface/ceed-jit-source-root-install.c index ffa78b21d5..b80dca4f9f 100644 --- a/interface/ceed-jit-source-root-install.c +++ b/interface/ceed-jit-source-root-install.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c index 4d4bf44e51..c50e683f9a 100644 --- a/interface/ceed-jit-tools.c +++ b/interface/ceed-jit-tools.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -97,7 +97,7 @@ static int CeedNormalizePath(Ceed ceed, const char *source_file_path, char **nor while (last_slash[0] != '/' && last_slash != *normalized_source_file_path) last_slash--; CeedCheck(last_slash != *normalized_source_file_path, ceed, CEED_ERROR_MAJOR, "Malformed source path %s", source_file_path); - for (CeedInt i = 0; first_dot[i - 1]; i++) last_slash[i] = first_dot[i + 2]; + for (CeedInt i = 0; first_dot[i + 1]; i++) last_slash[i] = first_dot[i + 2]; search_from = last_slash; } } @@ -130,8 +130,6 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed JiT ----------\n"); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Current source file: "); CeedDebug(ceed, "%s\n", source_file_path); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Current buffer:\n"); - CeedDebug(ceed, "%s\n", *buffer); // Read file to temporary buffer source_file = fopen(source_file_path, "rb"); @@ -139,9 +137,15 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C // -- Compute size of source fseek(source_file, 0L, SEEK_END); file_size = ftell(source_file); - rewind(source_file); + fseek(source_file, 0L, SEEK_SET); // -- Allocate memory for entire source file - CeedCall(CeedCalloc(file_size + 1, &temp_buffer)); + { + const int ierr = CeedCalloc(file_size + 1, &temp_buffer); + + // Close stream before error handling, if necessary + if (ierr != CEED_ERROR_SUCCESS) fclose(source_file); + CeedCall(ierr); + } // -- Copy the file into the buffer if (1 != fread(temp_buffer, file_size, 1, source_file)) { // LCOV_EXCL_START @@ -216,6 +220,9 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C bool is_ceed_header = next_left_chevron && (next_new_line - next_left_chevron > 0) && (!strncmp(next_left_chevron, "", 14) || !strncmp(next_left_chevron, "", 17) || !strncmp(next_left_chevron, "", 17)); + bool 
is_std_header = + next_left_chevron && (next_new_line - next_left_chevron > 0) && + (!strncmp(next_left_chevron, "", 8) || !strncmp(next_left_chevron, "num_jit_source_roots; i++) { + CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_source_dirs, &jit_source_dirs)); + for (CeedInt i = 0; i < num_source_dirs; i++) { bool is_valid; // Debug CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Checking JiT root: "); - CeedDebug(ceed, "%s\n", ceed_parent->jit_source_roots[i]); + CeedDebug(ceed, "%s\n", jit_source_dirs[i]); // Build and check absolute path with current root - CeedCall(CeedPathConcatenate(ceed, ceed_parent->jit_source_roots[i], relative_file_path, (char **)absolute_file_path)); + CeedCall(CeedPathConcatenate(ceed, jit_source_dirs[i], relative_file_path, (char **)absolute_file_path)); CeedCall(CeedCheckFilePath(ceed, *absolute_file_path, &is_valid)); - if (is_valid) return CEED_ERROR_SUCCESS; + if (is_valid) { + CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs)); + return CEED_ERROR_SUCCESS; + } // LCOV_EXCL_START - else CeedCall(CeedFree(absolute_file_path)); + else + CeedCall(CeedFree(absolute_file_path)); // LCOV_EXCL_STOP } // LCOV_EXCL_START diff --git a/interface/ceed-object.c b/interface/ceed-object.c new file mode 100644 index 0000000000..24b38dbb31 --- /dev/null +++ b/interface/ceed-object.c @@ -0,0 +1,185 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include + +/// @file +/// Implementation of CeedObject functionality + +/// ---------------------------------------------------------------------------- +/// CeedObject Backend API +/// ---------------------------------------------------------------------------- +/// @addtogroup CeedBackend +/// @{ + +/** + @brief Create a `CeedObject`. + + Note: This interface takes a `CeedObject` and not a pointer to a `CeedObject` like other `Ceed*Create` interfaces. + This `CeedObject` will have already been allocated as the first part of the `Ceed*` struct. + This function is only intended to be called inside of `Ceed*Create` functions. + + @param[in] ceed `Ceed` object to reference + @param[in] view_function `Ceed*` function for viewing the `obj` + @param[in] destroy_function `Ceed*` function for destroying the `obj` + @param[out] obj Address of the variable where this `CeedObject` exists + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), int (*destroy_function)(CeedObject *), CeedObject obj) { + obj->ceed = NULL; + if (ceed) CeedCall(CeedReferenceCopy(ceed, &obj->ceed)); + obj->View = view_function; + CeedCheck(destroy_function, CeedObjectReturnCeed(obj), CEED_ERROR_UNSUPPORTED, "Must provide destroy function to create CeedObject"); + obj->Destroy = destroy_function; + obj->ref_count = 1; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Increment the reference counter for a `CeedObject` + + @param[in,out] obj `CeedObject` to increment the reference counter + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedObjectReference(CeedObject obj) { + obj->ref_count++; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Decrement the reference counter for a `CeedObject` + + @param[in,out] obj `CeedObject` to decrement the 
reference counter + + @return The new reference count + + @ref Backend +**/ +int CeedObjectDereference(CeedObject obj) { + return --obj->ref_count; // prefix notation, to get new number of references +} + +/** + @brief Destroy a @ref CeedObject + + @param[in,out] obj `CeedObject` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedObjectDestroy_Private(CeedObject obj) { + CeedCheck(obj->ref_count == 0, CeedObjectReturnCeed(obj), CEED_ERROR_UNSUPPORTED, + "Cannot destroy CeedObject, it is still referenced by another object"); + if (obj->ceed) CeedCall(CeedDestroy(&obj->ceed)); + return CEED_ERROR_SUCCESS; +} + +/// @} + +/// ---------------------------------------------------------------------------- +/// CeedObject Public API +/// ---------------------------------------------------------------------------- +/// @addtogroup CeedUser +/// @{ + +/** + @brief View a `CeedObject` + + @param[in] obj `CeedObject` to view + @param[in] stream Stream to view to, e.g., `stdout` + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedObjectView(CeedObject obj, FILE *stream) { + if (obj->View) CeedCall(obj->View(obj, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set the number of tabs to indent for @ref CeedObjectView() output + + @param[in] obj `CeedObject` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs) { + CeedCheck(num_tabs >= 0, CeedObjectReturnCeed(obj), CEED_ERROR_MINOR, "Number of view tabs must be non-negative"); + obj->num_view_tabs = num_tabs; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedObjectView() output + + @param[in] obj `CeedObject` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise 
- failure + + @ref User +**/ +int CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs) { + *num_tabs = obj->num_view_tabs; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the `Ceed` associated with a `CeedObject` + + @param[in] obj `CeedObject` + @param[out] ceed Variable to store `Ceed` + + @return An error code: 0 - success, otherwise - failure + + @ref Advanced +**/ +int CeedObjectGetCeed(CeedObject obj, Ceed *ceed) { + *ceed = NULL; + CeedCall(CeedReferenceCopy(CeedObjectReturnCeed(obj), ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Return the `Ceed` associated with a `CeedObject` + + @param[in] obj `CeedObject` + + @return `Ceed` associated with the `obj` + + @ref Advanced +**/ +Ceed CeedObjectReturnCeed(CeedObject obj) { return (obj->ceed) ? obj->ceed : (Ceed)obj; } + +/** + @brief Destroy a @ref CeedObject + + @param[in,out] obj Address of `CeedObject` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedObjectDestroy(CeedObject *obj) { + CeedCall((*obj)->Destroy(obj)); + return CEED_ERROR_SUCCESS; +} + +/// @} diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index a4645dd2c6..55e72cb2c5 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -87,17 +87,17 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl @param[in] op_field `CeedOperator` Field to view @param[in] qf_field `CeedQFunction` Field (carries field name) @param[in] field_number Number of field being viewed - @param[in] sub true indicates sub-operator, which increases indentation; false for top-level operator - @param[in] input true for an input field; false for output field + @param[in] tabs Tabs to append before each line + @param[in] is_input `true` for an input field; `false` for output field @param[in] stream Stream to view to, e.g., `stdout` @return An error code: 0 - success, otherwise - failure @ref Utility **/ -static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, bool sub, bool input, FILE *stream) { - const char *pre = sub ? " " : ""; - const char *in_out = input ? "Input" : "Output"; +static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, const char *tabs, bool is_input, + FILE *stream) { + const char *field_type = is_input ? 
"Input" : "Output"; const char *field_name; CeedInt size; CeedEvalMode eval_mode; @@ -112,12 +112,15 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField "%s %s field %" CeedInt_FMT ":\n" "%s Name: \"%s\"\n", - pre, in_out, field_number, pre, field_name); - fprintf(stream, "%s Size: %" CeedInt_FMT "\n", pre, size); - fprintf(stream, "%s EvalMode: %s\n", pre, CeedEvalModes[eval_mode]); - if (basis == CEED_BASIS_NONE) fprintf(stream, "%s No basis\n", pre); - if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s Active vector\n", pre); - else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s No vector\n", pre); + tabs, field_type, field_number, tabs, field_name); + fprintf(stream, "%s Size: %" CeedInt_FMT "\n", tabs, size); + fprintf(stream, "%s EvalMode: %s\n", tabs, CeedEvalModes[eval_mode]); + if (basis == CEED_BASIS_NONE) fprintf(stream, "%s No basis\n", tabs); + if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s Active vector\n", tabs); + else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s No vector\n", tabs); + + CeedCall(CeedVectorDestroy(&vec)); + CeedCall(CeedBasisDestroy(&basis)); return CEED_ERROR_SUCCESS; } @@ -125,42 +128,85 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField @brief View a single `CeedOperator` @param[in] op `CeedOperator` to view - @param[in] sub Boolean flag for sub-operator + @param[in] tabs Tabs to append before each new line @param[in] stream Stream to write; typically `stdout` or a file @return Error code: 0 - success, otherwise - failure @ref Utility **/ -int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) { - const char *pre = sub ? 
" " : ""; +int CeedOperatorSingleView(CeedOperator op, const char *tabs, FILE *stream) { + bool is_at_points; CeedInt num_elem, num_qpts, total_fields = 0, num_input_fields, num_output_fields; CeedQFunction qf; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedOperatorField *op_input_fields, *op_output_fields; + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); CeedCall(CeedOperatorGetNumElements(op, &num_elem)); CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts)); CeedCall(CeedOperatorGetNumArgs(op, &total_fields)); CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCall(CeedQFunctionDestroy(&qf)); + + if (is_at_points) { + CeedInt max_points = 0; + CeedElemRestriction rstr_points; - fprintf(stream, "%s %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts); - fprintf(stream, "%s %" CeedInt_FMT " field%s\n", pre, total_fields, total_fields > 1 ? "s" : ""); - fprintf(stream, "%s %" CeedInt_FMT " input field%s:\n", pre, num_input_fields, num_input_fields > 1 ? "s" : ""); + CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_points)); + fprintf(stream, "%s %" CeedInt_FMT " elements with %" CeedInt_FMT " max points each\n", tabs, num_elem, max_points); + CeedCall(CeedElemRestrictionDestroy(&rstr_points)); + } else { + fprintf(stream, "%s %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", tabs, num_elem, num_qpts); + } + fprintf(stream, "%s %" CeedInt_FMT " field%s\n", tabs, total_fields, total_fields > 1 ? "s" : ""); + fprintf(stream, "%s %" CeedInt_FMT " input field%s:\n", tabs, num_input_fields, num_input_fields > 1 ? 
"s" : ""); for (CeedInt i = 0; i < num_input_fields; i++) { - CeedCall(CeedOperatorFieldView(op_input_fields[i], qf_input_fields[i], i, sub, 1, stream)); + CeedCall(CeedOperatorFieldView(op_input_fields[i], qf_input_fields[i], i, tabs, 1, stream)); } - fprintf(stream, "%s %" CeedInt_FMT " output field%s:\n", pre, num_output_fields, num_output_fields > 1 ? "s" : ""); + fprintf(stream, "%s %" CeedInt_FMT " output field%s:\n", tabs, num_output_fields, num_output_fields > 1 ? "s" : ""); for (CeedInt i = 0; i < num_output_fields; i++) { - CeedCall(CeedOperatorFieldView(op_output_fields[i], qf_output_fields[i], i, sub, 0, stream)); + CeedCall(CeedOperatorFieldView(op_output_fields[i], qf_output_fields[i], i, tabs, 0, stream)); } return CEED_ERROR_SUCCESS; } /** - @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator` + @brief View a `CeedOperator` passed as a `CeedObject` + + @param[in] op `CeedOperator` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedOperatorView_Object(CeedObject op, FILE *stream) { + CeedCall(CeedOperatorView((CeedOperator)op, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `CeedOperator` passed as a `CeedObject` + + @param[in,out] op Address of `CeedOperator` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedOperatorDestroy_Object(CeedObject *op) { + CeedCall(CeedOperatorDestroy((CeedOperator *)op)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`. + + Note: Caller is responsible for destroying the `active_basis` with @ref CeedBasisDestroy(). 
@param[in] op `CeedOperator` to find active `CeedBasis` for @param[out] active_basis `CeedBasis` for active input vector or `NULL` for composite operator @@ -175,7 +221,9 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) { } /** - @brief Find the active input and output vector `CeedBasis` for a non-composite `CeedOperator` + @brief Find the active input and output vector `CeedBasis` for a non-composite `CeedOperator`. + + Note: Caller is responsible for destroying the bases with @ref CeedBasisDestroy(). @param[in] op `CeedOperator` to find active `CeedBasis` for @param[out] active_input_basis `CeedBasis` for active input vector or `NULL` for composite operator @@ -188,10 +236,8 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) { int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis) { bool is_composite; CeedInt num_input_fields, num_output_fields; - Ceed ceed; CeedOperatorField *op_input_fields, *op_output_fields; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); @@ -206,11 +252,14 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C CeedBasis basis; CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - CeedCheck(!*active_input_basis || *active_input_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active input CeedBases found"); - *active_input_basis = basis; + CeedCheck(!*active_input_basis || *active_input_basis == basis, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, + "Multiple active input CeedBases found"); + if (!*active_input_basis) CeedCall(CeedBasisReferenceCopy(basis, active_input_basis)); + CeedCall(CeedBasisDestroy(&basis)); } + CeedCall(CeedVectorDestroy(&vec)); } - CeedCheck(*active_input_basis, ceed, CEED_ERROR_INCOMPLETE, "No active input 
CeedBasis found"); + CeedCheck(*active_input_basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active input CeedBasis found"); } } if (active_output_basis) { @@ -224,18 +273,23 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C CeedBasis basis; CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCheck(!*active_output_basis || *active_output_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active output CeedBases found"); - *active_output_basis = basis; + CeedCheck(!*active_output_basis || *active_output_basis == basis, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, + "Multiple active output CeedBases found"); + if (!*active_output_basis) CeedCall(CeedBasisReferenceCopy(basis, active_output_basis)); + CeedCall(CeedBasisDestroy(&basis)); } + CeedCall(CeedVectorDestroy(&vec)); } - CeedCheck(*active_output_basis, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedBasis found"); + CeedCheck(*active_output_basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active output CeedBasis found"); } } return CEED_ERROR_SUCCESS; } /** - @brief Find the active vector `CeedElemRestriction` for a non-composite `CeedOperator` + @brief Find the active vector `CeedElemRestriction` for a non-composite `CeedOperator`. + + Note: Caller is responsible for destroying the `active_rstr` with @ref CeedElemRestrictionDestroy(). @param[in] op `CeedOperator` to find active `CeedElemRestriction` for @param[out] active_rstr `CeedElemRestriction` for active input vector or NULL for composite operator @@ -250,7 +304,9 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *a } /** - @brief Find the active input and output vector `CeedElemRestriction` for a non-composite `CeedOperator` + @brief Find the active input and output vector `CeedElemRestriction` for a non-composite `CeedOperator`. + + Note: Caller is responsible for destroying the restrictions with @ref CeedElemRestrictionDestroy(). 
@param[in] op `CeedOperator` to find active `CeedElemRestriction` for @param[out] active_input_rstr `CeedElemRestriction` for active input vector or NULL for composite operator @@ -263,10 +319,8 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *a int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *active_input_rstr, CeedElemRestriction *active_output_rstr) { bool is_composite; CeedInt num_input_fields, num_output_fields; - Ceed ceed; CeedOperatorField *op_input_fields, *op_output_fields; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); @@ -281,11 +335,14 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction * CeedElemRestriction rstr; CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr)); - CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active input CeedElemRestrictions found"); - *active_input_rstr = rstr; + CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, + "Multiple active input CeedElemRestrictions found"); + if (!*active_input_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_input_rstr)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); } + CeedCall(CeedVectorDestroy(&vec)); } - CeedCheck(*active_input_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found"); + CeedCheck(*active_input_rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found"); } } if (active_output_rstr) { @@ -299,11 +356,14 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction * CeedElemRestriction rstr; CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr)); - CeedCheck(!*active_output_rstr || 
*active_output_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active output CeedElemRestrictions found"); - *active_output_rstr = rstr; + CeedCheck(!*active_output_rstr || *active_output_rstr == rstr, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, + "Multiple active output CeedElemRestrictions found"); + if (!*active_output_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_output_rstr)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); } + CeedCall(CeedVectorDestroy(&vec)); } - CeedCheck(*active_output_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found"); + CeedCheck(*active_output_rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found"); } } return CEED_ERROR_SUCCESS; @@ -322,14 +382,12 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction * @return An error code: 0 - success, otherwise - failure - @ref User + @ref Developer **/ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) { bool is_composite = false; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); - CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label"); + CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label"); // Check if field_label and op correspond if (field_label->from_op) { @@ -338,7 +396,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel for (CeedInt i = 0; i < op->num_context_labels; i++) { if (op->context_labels[i] == field_label) index = i; } - CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator"); + CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator"); } CeedCall(CeedOperatorIsComposite(op, &is_composite)); @@ -346,29 +404,28 @@ static int 
CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel CeedInt num_sub; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created"); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Composite operator modified after ContextFieldLabel created"); for (CeedInt i = 0; i < num_sub; i++) { - CeedQFunction qf; CeedQFunctionContext ctx; - CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf)); - CeedCall(CeedQFunctionGetContext(qf, &ctx)); + CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx)); // Try every sub-operator, ok if some sub-operators do not have field - if (field_label->sub_labels[i] && ctx) { + if (ctx && field_label->sub_labels[i]) { CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label->sub_labels[i], field_type, values)); } + CeedCall(CeedQFunctionContextDestroy(&ctx)); } } else { - CeedQFunction qf; CeedQFunctionContext ctx; - CeedCall(CeedOperatorGetQFunction(op, &qf)); - CeedCall(CeedQFunctionGetContext(qf, &ctx)); - CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); + CeedCall(CeedOperatorGetContext(op, &ctx)); + CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, field_type, values)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); } CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op, true)); return CEED_ERROR_SUCCESS; @@ -388,15 +445,13 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel @return An error code: 0 - success, otherwise 
- failure - @ref User + @ref Developer **/ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, size_t *num_values, void *values) { bool is_composite = false; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); - CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label"); + CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label"); *(void **)values = NULL; *num_values = 0; @@ -408,7 +463,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa for (CeedInt i = 0; i < op->num_context_labels; i++) { if (op->context_labels[i] == field_label) index = i; } - CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator"); + CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator"); } CeedCall(CeedOperatorIsComposite(op, &is_composite)); @@ -416,30 +471,30 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa CeedInt num_sub; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created"); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Composite operator modified after ContextFieldLabel created"); for (CeedInt i = 0; i < num_sub; i++) { - CeedQFunction qf; CeedQFunctionContext ctx; - CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf)); - CeedCall(CeedQFunctionGetContext(qf, &ctx)); + CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx)); // 
Try every sub-operator, ok if some sub-operators do not have field - if (field_label->sub_labels[i] && ctx) { + if (ctx && field_label->sub_labels[i]) { CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label->sub_labels[i], field_type, num_values, values)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); return CEED_ERROR_SUCCESS; } + CeedCall(CeedQFunctionContextDestroy(&ctx)); } } else { - CeedQFunction qf; CeedQFunctionContext ctx; - CeedCall(CeedOperatorGetQFunction(op, &qf)); - CeedCall(CeedQFunctionGetContext(qf, &ctx)); - CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); + CeedCall(CeedOperatorGetContext(op, &ctx)); + CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label, field_type, num_values, values)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); } return CEED_ERROR_SUCCESS; } @@ -457,14 +512,12 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa @return An error code: 0 - success, otherwise - failure - @ref User + @ref Developer **/ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) { bool is_composite = false; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); - CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label"); + CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label"); // Check if field_label and op correspond if (field_label->from_op) { @@ -473,7 +526,7 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie for (CeedInt i = 0; i < op->num_context_labels; i++) { if (op->context_labels[i] == field_label) index = i; } - CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator"); + CeedCheck(index != -1, 
CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator"); } CeedCall(CeedOperatorIsComposite(op, &is_composite)); @@ -481,30 +534,30 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie CeedInt num_sub; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created"); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Composite operator modified after ContextFieldLabel created"); for (CeedInt i = 0; i < num_sub; i++) { - CeedQFunction qf; CeedQFunctionContext ctx; - CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf)); - CeedCall(CeedQFunctionGetContext(qf, &ctx)); + CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx)); // Try every sub-operator, ok if some sub-operators do not have field - if (field_label->sub_labels[i] && ctx) { + if (ctx && field_label->sub_labels[i]) { CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label->sub_labels[i], field_type, values)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); return CEED_ERROR_SUCCESS; } + CeedCall(CeedQFunctionContextDestroy(&ctx)); } } else { - CeedQFunction qf; CeedQFunctionContext ctx; - CeedCall(CeedOperatorGetQFunction(op, &qf)); - CeedCall(CeedQFunctionGetContext(qf, &ctx)); - CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); + CeedCall(CeedOperatorGetContext(op, &ctx)); + CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label, field_type, values)); + 
CeedCall(CeedQFunctionContextDestroy(&ctx)); } return CEED_ERROR_SUCCESS; } @@ -561,8 +614,9 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) { CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { CeedCall(CeedBasisIsTensor(basis, &is_tensor)); - *has_tensor_bases &= is_tensor; + *has_tensor_bases = *has_tensor_bases & is_tensor; } + CeedCall(CeedBasisDestroy(&basis)); } for (CeedInt i = 0; i < num_outputs; i++) { bool is_tensor; @@ -571,8 +625,9 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) { CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { CeedCall(CeedBasisIsTensor(basis, &is_tensor)); - *has_tensor_bases &= is_tensor; + *has_tensor_bases = *has_tensor_bases & is_tensor; } + CeedCall(CeedBasisDestroy(&basis)); } return CEED_ERROR_SUCCESS; } @@ -622,7 +677,8 @@ int CeedOperatorGetQFunction(CeedOperator op, CeedQFunction *qf) { CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Not defined for composite operator"); - *qf = op->qf; + *qf = NULL; + CeedCall(CeedQFunctionReferenceCopy(op->qf, qf)); return CEED_ERROR_SUCCESS; } @@ -681,7 +737,7 @@ int CeedOperatorSetData(CeedOperator op, void *data) { @ref Backend **/ int CeedOperatorReference(CeedOperator op) { - op->ref_count++; + CeedCall(CeedObjectReference((CeedObject)op)); return CEED_ERROR_SUCCESS; } @@ -727,22 +783,21 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorCreate"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreate"); CeedCall(CeedOperatorCreate(delegate, qf, dqf, dqfT, op)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } 
CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction."); CeedCall(CeedCalloc(1, op)); - CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed)); - (*op)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj)); (*op)->input_size = -1; (*op)->output_size = -1; CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf)); if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT)); - CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled)); CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields)); CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields)); CeedCall(ceed->OperatorCreate(*op)); @@ -770,23 +825,22 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorCreateAtPoints"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreateAtPoints"); CeedCall(CeedOperatorCreateAtPoints(delegate, qf, dqf, dqfT, op)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction."); CeedCall(CeedCalloc(1, op)); - CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed)); - (*op)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj)); (*op)->is_at_points = true; (*op)->input_size = -1; (*op)->output_size = -1; CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf)); if (dqfT && dqfT != CEED_QFUNCTION_NONE) 
CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT)); - CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled)); CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields)); CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields)); CeedCall(ceed->OperatorCreateAtPoints(*op)); @@ -803,20 +857,20 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C @ref User */ -int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op) { +int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op) { if (!ceed->CompositeOperatorCreate) { Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator")); if (delegate) { - CeedCall(CeedCompositeOperatorCreate(delegate, op)); + CeedCall(CeedOperatorCreateComposite(delegate, op)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } } CeedCall(CeedCalloc(1, op)); - CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed)); - (*op)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj)); (*op)->is_composite = true; CeedCall(CeedCalloc(CEED_COMPOSITE_MAX, &(*op)->sub_operators)); (*op)->input_size = -1; @@ -874,38 +928,38 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector vec) { bool is_input = true, is_at_points, is_composite, is_immutable; CeedInt num_elem = 0, num_qpts = 0, num_input_fields, num_output_fields; - Ceed ceed; CeedQFunction qf; CeedQFunctionField qf_field, *qf_input_fields, *qf_output_fields; CeedOperatorField *op_field; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); - CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator."); - CeedCheck(!is_immutable, ceed, 
CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); - CeedCheck(rstr, ceed, CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction rstr for field \"%s\" must be non-NULL.", field_name); - CeedCheck(basis, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis basis for field \"%s\" must be non-NULL.", field_name); - CeedCheck(vec, ceed, CEED_ERROR_INCOMPATIBLE, "CeedVector vec for field \"%s\" must be non-NULL.", field_name); + CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator."); + CeedCheck(!is_immutable, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); + CeedCheck(rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction rstr for field \"%s\" must be non-NULL.", field_name); + CeedCheck(basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedBasis basis for field \"%s\" must be non-NULL.", field_name); + CeedCheck(vec, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedVector vec for field \"%s\" must be non-NULL.", field_name); CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || num_elem == op->num_elem, ceed, CEED_ERROR_DIMENSION, + CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || num_elem == op->num_elem, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "CeedElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" CeedInt_FMT " elements", num_elem, op->num_elem); { CeedRestrictionType rstr_type; CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); if (rstr_type == CEED_RESTRICTION_POINTS) { - CeedCheck(is_at_points, ceed, CEED_ERROR_UNSUPPORTED, "CeedElemRestriction AtPoints not supported for standard operator fields"); - CeedCheck(basis == CEED_BASIS_NONE, ceed, CEED_ERROR_UNSUPPORTED, "CeedElemRestriction AtPoints must be used with CEED_BASIS_NONE"); + CeedCheck(is_at_points, 
CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "CeedElemRestriction AtPoints not supported for standard operator fields"); + CeedCheck(basis == CEED_BASIS_NONE, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "CeedElemRestriction AtPoints must be used with CEED_BASIS_NONE"); if (!op->first_points_rstr) { CeedCall(CeedElemRestrictionReferenceCopy(rstr, &op->first_points_rstr)); } else { bool are_compatible; CeedCall(CeedElemRestrictionAtPointsAreCompatible(op->first_points_rstr, rstr, &are_compatible)); - CeedCheck(are_compatible, ceed, CEED_ERROR_INCOMPATIBLE, + CeedCheck(are_compatible, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction must have compatible offsets with previously set CeedElemRestriction"); } } @@ -913,13 +967,14 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri if (basis == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(rstr, &num_qpts)); else CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); - CeedCheck(op->num_qpts == 0 || num_qpts == op->num_qpts, ceed, CEED_ERROR_DIMENSION, + CeedCheck(op->num_qpts == 0 || num_qpts == op->num_qpts, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "%s must correspond to the same number of quadrature points as previously added CeedBases. Found %" CeedInt_FMT " quadrature points but expected %" CeedInt_FMT " quadrature points.", basis == CEED_BASIS_NONE ? 
"CeedElemRestriction" : "CeedBasis", num_qpts, op->num_qpts); CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields)); + CeedCall(CeedQFunctionDestroy(&qf)); for (CeedInt i = 0; i < num_input_fields; i++) { const char *qf_field_name; @@ -942,10 +997,10 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri } } // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPLETE, "CeedQFunction has no knowledge of field '%s'", field_name); + return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "CeedQFunction has no knowledge of field '%s'", field_name); // LCOV_EXCL_STOP found: - CeedCall(CeedOperatorCheckField(ceed, qf_field, rstr, basis)); + CeedCall(CeedOperatorCheckField(CeedOperatorReturnCeed(op), qf_field, rstr, basis)); CeedCall(CeedCalloc(1, op_field)); if (vec == CEED_VECTOR_ACTIVE) { @@ -954,11 +1009,11 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); if (is_input) { if (op->input_size == -1) op->input_size = l_size; - CeedCheck(l_size == op->input_size, ceed, CEED_ERROR_INCOMPATIBLE, + CeedCheck(l_size == op->input_size, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "LVector size %" CeedSize_FMT " does not match previous size %" CeedSize_FMT "", l_size, op->input_size); } else { if (op->output_size == -1) op->output_size = l_size; - CeedCheck(l_size == op->output_size, ceed, CEED_ERROR_INCOMPATIBLE, + CeedCheck(l_size == op->output_size, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "LVector size %" CeedSize_FMT " does not match previous size %" CeedSize_FMT "", l_size, op->output_size); } } @@ -1002,6 +1057,7 @@ int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperat CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetFields(qf, num_input_fields, NULL, 
num_output_fields, NULL)); + CeedCall(CeedQFunctionDestroy(&qf)); if (input_fields) *input_fields = op->input_fields; if (output_fields) *output_fields = op->output_fields; return CEED_ERROR_SUCCESS; @@ -1022,13 +1078,11 @@ int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperat **/ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_points, CeedVector point_coords) { bool is_at_points, is_immutable; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); - CeedCheck(is_at_points, ceed, CEED_ERROR_MINOR, "Only defined for operator at points"); - CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); + CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points"); + CeedCheck(!is_immutable, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); if (!op->first_points_rstr) { CeedCall(CeedElemRestrictionReferenceCopy(rstr_points, &op->first_points_rstr)); @@ -1036,7 +1090,7 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin bool are_compatible; CeedCall(CeedElemRestrictionAtPointsAreCompatible(op->first_points_rstr, rstr_points, &are_compatible)); - CeedCheck(are_compatible, ceed, CEED_ERROR_INCOMPATIBLE, + CeedCheck(are_compatible, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction must have compatible offsets with previously set field CeedElemRestriction"); } @@ -1047,10 +1101,10 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin /** @brief Get a boolean value indicating if the `CeedOperator` was created with `CeedOperatorCreateAtPoints` - + @param[in] op `CeedOperator` @param[out] is_at_points Variable to store at points status - + @return An error code: 0 - success, otherwise - failure 
@ref User @@ -1080,8 +1134,14 @@ int CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_poi CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points"); CeedCall(CeedOperatorCheckReady(op)); - if (rstr_points) CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points)); - if (point_coords) CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords)); + if (rstr_points) { + *rstr_points = NULL; + CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points)); + } + if (point_coords) { + *point_coords = NULL; + CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords)); + } return CEED_ERROR_SUCCESS; } @@ -1140,7 +1200,9 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name } /** - @brief Get the `CeedElemRestriction` of a `CeedOperator` Field + @brief Get the `CeedElemRestriction` of a `CeedOperator` Field. + + Note: Caller is responsible for destroying the `rstr` with @ref CeedElemRestrictionDestroy(). @param[in] op_field `CeedOperator` Field @param[out] rstr Variable to store `CeedElemRestriction` @@ -1150,12 +1212,15 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name @ref Advanced **/ int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr) { - *rstr = op_field->elem_rstr; + *rstr = NULL; + CeedCall(CeedElemRestrictionReferenceCopy(op_field->elem_rstr, rstr)); return CEED_ERROR_SUCCESS; } /** - @brief Get the `CeedBasis` of a `CeedOperator` Field + @brief Get the `CeedBasis` of a `CeedOperator` Field. + + Note: Caller is responsible for destroying the `basis` with @ref CeedBasisDestroy(). 
@param[in] op_field `CeedOperator` Field @param[out] basis Variable to store `CeedBasis` @@ -1165,12 +1230,15 @@ int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRest @ref Advanced **/ int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis) { - *basis = op_field->basis; + *basis = NULL; + CeedCall(CeedBasisReferenceCopy(op_field->basis, basis)); return CEED_ERROR_SUCCESS; } /** - @brief Get the `CeedVector` of a `CeedOperator` Field + @brief Get the `CeedVector` of a `CeedOperator` Field. + + Note: Caller is responsible for destroying the `vec` with @ref CeedVectorDestroy(). @param[in] op_field `CeedOperator` Field @param[out] vec Variable to store `CeedVector` @@ -1180,14 +1248,17 @@ int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis) { @ref Advanced **/ int CeedOperatorFieldGetVector(CeedOperatorField op_field, CeedVector *vec) { - *vec = op_field->vec; + *vec = NULL; + CeedCall(CeedVectorReferenceCopy(op_field->vec, vec)); return CEED_ERROR_SUCCESS; } /** @brief Get the data of a `CeedOperator` Field. - Any arguments set as `NULL` are ignored. + Any arguments set as `NULL` are ignored.. + + Note: Caller is responsible for destroying the `rstr`, `basis`, and `vec`. 
@param[in] op_field `CeedOperator` Field @param[out] field_name Variable to store the field name @@ -1217,15 +1288,14 @@ int CeedOperatorFieldGetData(CeedOperatorField op_field, const char **field_name @ref User */ -int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) { +int CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOperator sub_op) { bool is_immutable; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(composite_op, &ceed)); - CeedCheck(composite_op->is_composite, ceed, CEED_ERROR_MINOR, "CeedOperator is not a composite operator"); - CeedCheck(composite_op->num_suboperators < CEED_COMPOSITE_MAX, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add additional sub-operators"); + CeedCheck(composite_op->is_composite, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MINOR, "CeedOperator is not a composite operator"); + CeedCheck(composite_op->num_suboperators < CEED_COMPOSITE_MAX, CeedOperatorReturnCeed(composite_op), CEED_ERROR_UNSUPPORTED, + "Cannot add additional sub-operators"); CeedCall(CeedOperatorIsImmutable(composite_op, &is_immutable)); - CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); + CeedCheck(!is_immutable, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); { CeedSize input_size, output_size; @@ -1234,8 +1304,8 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) if (composite_op->input_size == -1) composite_op->input_size = input_size; if (composite_op->output_size == -1) composite_op->output_size = output_size; // Note, a size of -1 means no active vector restriction set, so no incompatibility - CeedCheck((input_size == -1 || input_size == composite_op->input_size) && (output_size == -1 || output_size == composite_op->output_size), ceed, - CEED_ERROR_MAJOR, + CeedCheck((input_size == -1 || input_size == composite_op->input_size) && (output_size == -1 || output_size == 
composite_op->output_size), + CeedOperatorReturnCeed(composite_op), CEED_ERROR_MAJOR, "Sub-operators must have compatible dimensions; composite operator of shape (%" CeedSize_FMT ", %" CeedSize_FMT ") not compatible with sub-operator of " "shape (%" CeedSize_FMT ", %" CeedSize_FMT ")", @@ -1258,7 +1328,7 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) @ref Backend **/ -int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) { +int CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators) { bool is_composite; CeedCall(CeedOperatorIsComposite(op, &is_composite)); @@ -1277,7 +1347,7 @@ int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) { @ref Backend **/ -int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators) { +int CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators) { bool is_composite; CeedCall(CeedOperatorIsComposite(op, &is_composite)); @@ -1286,6 +1356,82 @@ int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operator return CEED_ERROR_SUCCESS; } +/** + @brief Get a sub `CeedOperator` of a composite `CeedOperator` from its name. + + `sub_op` is set to `NULL` if the sub operator is not found. + + Note: Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable. 
+ + @param[in] op Composite `CeedOperator` + @param[in] op_name Name of desired sub `CeedOperator` + @param[out] sub_op Sub `CeedOperator` corresponding to the name + + @return An error code: 0 - success, otherwise - failure + + @ref Advanced +**/ +int CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op) { + bool is_composite; + CeedInt num_sub_ops; + CeedOperator *sub_ops; + + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator"); + *sub_op = NULL; + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub_ops)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_ops)); + for (CeedInt i = 0; i < num_sub_ops; i++) { + if (sub_ops[i]->name && !strcmp(op_name, sub_ops[i]->name)) { + *sub_op = sub_ops[i]; + return CEED_ERROR_SUCCESS; + } + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set whether the sub-operators of the composite `CeedOperator` must be run sequentially. + + Note: This value currently only affects the GPU `/gpu/cuda/gen` and `/gpu/hip/gen` backends. + + @param[in] op Composite `CeedOperator` + @param[in] is_sequential Flag value to set, if `true`, forces the composite `CeedOperator` to execute sequentially + + @return An error code: 0 - success, otherwise - failure + + @ref Advanced +**/ +int CeedOperatorCompositeSetSequential(CeedOperator op, bool is_sequential) { + bool is_composite; + + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator"); + op->is_sequential = is_sequential; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get whether the sub-operators of the composite `CeedOperator` must be run sequentially. + + Note: This value currently only affects the GPU `/gpu/cuda/gen` and `/gpu/hip/gen` backends. 
+ + @param[in] op Composite `CeedOperator` + @param[out] is_sequential Variable to store sequential status + + @return An error code: 0 - success, otherwise - failure + + @ref Advanced +**/ +int CeedOperatorCompositeIsSequential(CeedOperator op, bool *is_sequential) { + bool is_composite; + + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator"); + *is_sequential = op->is_sequential; + return CEED_ERROR_SUCCESS; +} + /** @brief Check if a `CeedOperator` is ready to be used. @@ -1297,19 +1443,17 @@ int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operator **/ int CeedOperatorCheckReady(CeedOperator op) { bool is_at_points, is_composite; - Ceed ceed; CeedQFunction qf = NULL; if (op->is_interface_setup) return CEED_ERROR_SUCCESS; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (!is_composite) CeedCall(CeedOperatorGetQFunction(op, &qf)); if (is_composite) { CeedInt num_suboperators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); if (!num_suboperators) { // Empty operator setup op->input_size = 0; @@ -1317,7 +1461,7 @@ int CeedOperatorCheckReady(CeedOperator op) { } else { CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); for (CeedInt i = 0; i < num_suboperators; i++) { CeedCall(CeedOperatorCheckReady(sub_operators[i])); } @@ -1329,17 +1473,19 @@ int CeedOperatorCheckReady(CeedOperator op) { } else { CeedInt num_input_fields, num_output_fields; - CeedCheck(op->num_fields > 0, ceed, CEED_ERROR_INCOMPLETE, "No operator fields set"); + CeedCheck(op->num_fields > 0, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No operator fields set"); 
CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, NULL, &num_output_fields, NULL)); - CeedCheck(op->num_fields == num_input_fields + num_output_fields, ceed, CEED_ERROR_INCOMPLETE, "Not all operator fields set"); - CeedCheck(op->has_restriction, ceed, CEED_ERROR_INCOMPLETE, "At least one restriction required"); - CeedCheck(op->num_qpts > 0 || is_at_points, ceed, CEED_ERROR_INCOMPLETE, + CeedCheck(op->num_fields == num_input_fields + num_output_fields, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, + "Not all operator fields set"); + CeedCheck(op->has_restriction, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "At least one restriction required"); + CeedCheck(op->num_qpts > 0 || is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "At least one non-collocated CeedBasis is required or the number of quadrature points must be set"); } // Flag as immutable and ready op->is_interface_setup = true; if (qf && qf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(qf)); + CeedCall(CeedQFunctionDestroy(&qf)); if (op->dqf && op->dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(op->dqf)); if (op->dqfT && op->dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(op->dqfT)); return CEED_ERROR_SUCCESS; @@ -1369,8 +1515,8 @@ int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, Ce CeedInt num_suboperators; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); for (CeedInt i = 0; i < num_suboperators; i++) { CeedSize sub_input_size, sub_output_size; @@ -1411,7 +1557,10 @@ int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_d CeedCall(CeedOperatorSetQFunctionAssemblyReuse(op->sub_operators[i], reuse_assembly_data)); } } else { - 
CeedCall(CeedQFunctionAssemblyDataSetReuse(op->qf_assembled, reuse_assembly_data)); + CeedQFunctionAssemblyData data; + + CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data)); + CeedCall(CeedQFunctionAssemblyDataSetReuse(data, reuse_assembly_data)); } return CEED_ERROR_SUCCESS; } @@ -1434,13 +1583,16 @@ int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs CeedInt num_suboperators; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); for (CeedInt i = 0; i < num_suboperators; i++) { CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(sub_operators[i], needs_data_update)); } } else { - CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, needs_data_update)); + CeedQFunctionAssemblyData data; + + CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data)); + CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, needs_data_update)); } return CEED_ERROR_SUCCESS; } @@ -1469,36 +1621,135 @@ int CeedOperatorSetName(CeedOperator op, const char *name) { } /** - @brief View a `CeedOperator` + @brief Get name of `CeedOperator` - @param[in] op `CeedOperator` to view - @param[in] stream Stream to write; typically `stdout` or a file + @param[in] op `CeedOperator` + @param[in,out] name Address of variable to hold currently set name - @return Error code: 0 - success, otherwise - failure + @return An error code: 0 - success, otherwise - failure @ref User **/ -int CeedOperatorView(CeedOperator op, FILE *stream) { - bool has_name = op->name, is_composite; +int CeedOperatorGetName(CeedOperator op, const char **name) { + if (op->name) { + *name = op->name; + } else if (!op->is_composite) { + CeedQFunction qf; + + CeedCall(CeedOperatorGetQFunction(op, &qf)); + if (qf) 
CeedCall(CeedQFunctionGetName(qf, name)); + CeedCall(CeedQFunctionDestroy(&qf)); + } + return CEED_ERROR_SUCCESS; +} +/** + @brief Core logic for viewing a `CeedOperator` + + @param[in] op `CeedOperator` to view brief summary + @param[in] stream Stream to write; typically `stdout` or a file + @param[in] is_full Whether to write full operator view or terse + + @return Error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) { + bool has_name, is_composite, is_at_points; + char *tabs = NULL; + const char *name = NULL; + CeedInt num_tabs = 0; + + CeedCall(CeedOperatorGetName(op, &name)); + has_name = name ? strlen(name) : false; CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); + // Set tabs + CeedCall(CeedOperatorGetNumViewTabs(op, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * (num_tabs + is_composite) + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; if (is_composite) { CeedInt num_suboperators; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : ""); - + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + fprintf(stream, "%s", tabs); + fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : ""); + for (CeedInt i = 0; i < CEED_TAB_WIDTH; i++) tabs[CEED_TAB_WIDTH * num_tabs + i] = ' '; for (CeedInt i = 0; i < num_suboperators; i++) { has_name = sub_operators[i]->name; - fprintf(stream, " SubOperator %" CeedInt_FMT "%s%s:\n", i, has_name ? " - " : "", has_name ? 
sub_operators[i]->name : ""); - CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream)); + fprintf(stream, "%s", tabs); + fprintf(stream, "SubOperator%s %" CeedInt_FMT "%s%s%s\n", is_at_points ? " AtPoints" : "", i, has_name ? " - " : "", + has_name ? sub_operators[i]->name : "", is_full ? ":" : ""); + if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], tabs, stream)); } } else { - fprintf(stream, "CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : ""); - CeedCall(CeedOperatorSingleView(op, 0, stream)); + fprintf(stream, "%s", tabs); + fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? name : ""); + if (is_full) CeedCall(CeedOperatorSingleView(op, tabs, stream)); } + CeedCall(CeedFree(&tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set the number of tabs to indent for @ref CeedOperatorView() output + + @param[in] op `CeedOperator` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs) { + CeedCall(CeedObjectSetNumViewTabs((CeedObject)op, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedOperatorView() output + + @param[in] op `CeedOperator` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) { + CeedCall(CeedObjectGetNumViewTabs((CeedObject)op, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief View a `CeedOperator` + + @param[in] op `CeedOperator` to view + @param[in] stream Stream to write; typically `stdout` or a file + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorView(CeedOperator op, FILE *stream) { + 
CeedCall(CeedOperatorView_Core(op, stream, true)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief View a brief summary `CeedOperator` + + @param[in] op `CeedOperator` to view brief summary + @param[in] stream Stream to write; typically `stdout` or a file + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorViewTerse(CeedOperator op, FILE *stream) { + CeedCall(CeedOperatorView_Core(op, stream, false)); return CEED_ERROR_SUCCESS; } @@ -1513,7 +1764,7 @@ int CeedOperatorView(CeedOperator op, FILE *stream) { @ref Advanced **/ int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) { - *ceed = CeedOperatorReturnCeed(op); + CeedCall(CeedObjectGetCeed((CeedObject)op, ceed)); return CEED_ERROR_SUCCESS; } @@ -1526,7 +1777,7 @@ int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) { @ref Advanced **/ -Ceed CeedOperatorReturnCeed(CeedOperator op) { return op->ceed; } +Ceed CeedOperatorReturnCeed(CeedOperator op) { return CeedObjectReturnCeed((CeedObject)op); } /** @brief Get the number of elements associated with a `CeedOperator` @@ -1584,9 +1835,9 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { if (is_composite) { CeedInt num_suboperators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); // FLOPs for each suboperator for (CeedInt i = 0; i < num_suboperators; i++) { @@ -1596,15 +1847,40 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { *flops += suboperator_flops; } } else { - CeedInt num_input_fields, num_output_fields, num_elem = 0; + bool is_at_points; + CeedInt num_input_fields, num_output_fields, num_elem = 0, num_points = 0; CeedQFunction qf; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedOperatorField *op_input_fields, 
*op_output_fields; + CeedCall(CeedOperatorGetNumElements(op, &num_elem)); + if (num_elem == 0) return CEED_ERROR_SUCCESS; + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); + if (is_at_points) { + CeedMemType mem_type; + CeedElemRestriction rstr_points = NULL; + + CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCall(CeedGetPreferredMemType(CeedOperatorReturnCeed(op), &mem_type)); + if (mem_type == CEED_MEM_DEVICE) { + // Device backends pad out to the same number of points per element + CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &num_points)); + } else { + num_points = 0; + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt points_in_elem = 0; + + CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr_points, i, &points_in_elem)); + num_points += points_in_elem; + } + num_points = num_points / num_elem + (num_points % num_elem > 0); + } + CeedCall(CeedElemRestrictionDestroy(&rstr_points)); + } CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields)); + CeedCall(CeedQFunctionDestroy(&qf)); CeedCall(CeedOperatorGetFields(op, NULL, &op_input_fields, NULL, &op_output_fields)); - CeedCall(CeedOperatorGetNumElements(op, &num_elem)); // Input FLOPs for (CeedInt i = 0; i < num_input_fields; i++) { @@ -1619,12 +1895,15 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr)); CeedCall(CeedElemRestrictionGetFlopsEstimate(rstr, CEED_NOTRANSPOSE, &rstr_flops)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); *flops += rstr_flops; CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); CeedCall(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, &basis_flops)); + CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, is_at_points, 
num_points, &basis_flops)); + CeedCall(CeedBasisDestroy(&basis)); *flops += basis_flops * num_elem; } + CeedCall(CeedVectorDestroy(&vec)); } // QF FLOPs { @@ -1632,9 +1911,11 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { CeedSize qf_flops; CeedQFunction qf; - CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts)); + if (is_at_points) num_qpts = num_points; + else CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts)); CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetFlopsEstimate(qf, &qf_flops)); + CeedCall(CeedQFunctionDestroy(&qf)); CeedCheck(qf_flops > -1, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "Must set CeedQFunction FLOPs estimate with CeedQFunctionSetUserFlopsEstimate"); *flops += num_elem * num_qpts * qf_flops; @@ -1653,12 +1934,15 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr)); CeedCall(CeedElemRestrictionGetFlopsEstimate(rstr, CEED_TRANSPOSE, &rstr_flops)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); *flops += rstr_flops; CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); CeedCall(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, &basis_flops)); + CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops)); + CeedCall(CeedBasisDestroy(&basis)); *flops += basis_flops * num_elem; } + CeedCall(CeedVectorDestroy(&vec)); } } return CEED_ERROR_SUCCESS; @@ -1688,6 +1972,8 @@ int CeedOperatorGetContext(CeedOperator op, CeedQFunctionContext *ctx) { CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "Cannot retrieve CeedQFunctionContext for composite operator"); CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetInnerContext(qf, &qf_ctx)); + CeedCall(CeedQFunctionDestroy(&qf)); + *ctx = NULL; if (qf_ctx) 
CeedCall(CeedQFunctionContextReferenceCopy(qf_ctx, ctx)); return CEED_ERROR_SUCCESS; } @@ -1726,8 +2012,8 @@ int CeedOperatorGetContextFieldLabel(CeedOperator op, const char *field_name, Ce CeedContextFieldLabel new_field_label; CeedCall(CeedCalloc(1, &new_field_label)); - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); CeedCall(CeedCalloc(num_sub, &new_field_label->sub_labels)); new_field_label->num_sub_labels = num_sub; @@ -1780,6 +2066,7 @@ int CeedOperatorGetContextFieldLabel(CeedOperator op, const char *field_name, Ce // Single, non-composite operator CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedCall(CeedQFunctionGetInnerContext(qf, &ctx)); + CeedCall(CeedQFunctionDestroy(&qf)); if (ctx) { CeedCall(CeedQFunctionContextGetFieldLabel(ctx, field_name, field_label)); } else { @@ -1961,7 +2248,7 @@ int CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel This computes the action of the operator on the specified (active) input, yielding its (active) output. All inputs and outputs must be specified using @ref CeedOperatorSetField(). - Note: Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable. + @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable. 
@param[in] op `CeedOperator` to apply @param[in] in `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs @@ -1978,58 +2265,19 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques CeedCall(CeedOperatorCheckReady(op)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); - if (is_composite) { + if (is_composite && op->ApplyComposite) { // Composite Operator - if (op->ApplyComposite) { - CeedCall(op->ApplyComposite(op, in, out, request)); - } else { - CeedInt num_suboperators; - CeedOperator *sub_operators; - - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - - // Zero all output vectors - if (out != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(out, 0.0)); - for (CeedInt i = 0; i < num_suboperators; i++) { - CeedInt num_output_fields; - CeedOperatorField *output_fields; - - CeedCall(CeedOperatorGetFields(sub_operators[i], NULL, NULL, &num_output_fields, &output_fields)); - for (CeedInt j = 0; j < num_output_fields; j++) { - CeedVector vec; - - CeedCall(CeedOperatorFieldGetVector(output_fields[j], &vec)); - if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) { - CeedCall(CeedVectorSetValue(vec, 0.0)); - } - } - } - // Apply - for (CeedInt i = 0; i < num_suboperators; i++) { - CeedCall(CeedOperatorApplyAdd(sub_operators[i], in, out, request)); - } - } - } else { + CeedCall(op->ApplyComposite(op, in, out, request)); + } else if (!is_composite && op->Apply) { // Standard Operator - if (op->Apply) { - CeedCall(op->Apply(op, in, out, request)); - } else { - CeedInt num_output_fields; - CeedOperatorField *output_fields; - - CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields)); - // Zero all output vectors - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; + CeedCall(op->Apply(op, in, out, request)); + } else { + // Standard or composite, default to zeroing out and 
calling ApplyAddActive + // Zero active output + if (out != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(out, 0.0)); - CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) vec = out; - if (vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0)); - } - // Apply - if (op->num_elem > 0) CeedCall(op->ApplyAdd(op, in, out, request)); - } + // ApplyAddActive + CeedCall(CeedOperatorApplyAddActive(op, in, out, request)); } return CEED_ERROR_SUCCESS; } @@ -2040,6 +2288,10 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques This computes the action of the operator on the specified (active) input, yielding its (active) output. All inputs and outputs must be specified using @ref CeedOperatorSetField(). + @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable. + @warning This function adds into ALL outputs, including passive outputs. To only add into the active output, use `CeedOperatorApplyAddActive()`. 
+ @see `CeedOperatorApplyAddActive()` + @param[in] op `CeedOperator` to apply @param[in] in `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs @param[out] out `CeedVector` to sum in result of applying operator (must be distinct from `in`) or @ref CEED_VECTOR_NONE if there are no active outputs @@ -2063,8 +2315,8 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq CeedInt num_suboperators; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); for (CeedInt i = 0; i < num_suboperators; i++) { CeedCall(CeedOperatorApplyAdd(sub_operators[i], in, out, request)); } @@ -2076,6 +2328,102 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq return CEED_ERROR_SUCCESS; } +/** + @brief Apply `CeedOperator` to a `CeedVector` and add result to output `CeedVector`. Only sums into active outputs, overwrites passive outputs. + + This computes the action of the operator on the specified (active) input, yielding its (active) output. + All inputs and outputs must be specified using @ref CeedOperatorSetField(). + + @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable. 
+ + @param[in] op `CeedOperator` to apply + @param[in] in `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs + @param[out] out `CeedVector` to sum in result of applying operator (must be distinct from `in`) or @ref CEED_VECTOR_NONE if there are no active outputs + @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request) { + bool is_composite; + + CeedCall(CeedOperatorCheckReady(op)); + + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { + // Composite Operator + CeedInt num_suboperators; + CeedOperator *sub_operators; + + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + + // Zero all output vectors + for (CeedInt i = 0; i < num_suboperators; i++) { + CeedInt num_output_fields; + CeedOperatorField *output_fields; + + CeedCall(CeedOperatorGetFields(sub_operators[i], NULL, NULL, &num_output_fields, &output_fields)); + for (CeedInt j = 0; j < num_output_fields; j++) { + CeedVector vec; + + CeedCall(CeedOperatorFieldGetVector(output_fields[j], &vec)); + if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0)); + CeedCall(CeedVectorDestroy(&vec)); + } + } + // ApplyAdd + CeedCall(CeedOperatorApplyAdd(op, in, out, request)); + } else { + // Standard Operator + CeedInt num_output_fields; + CeedOperatorField *output_fields; + + CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields)); + // Zero all output vectors + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + + CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec)); + if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) 
CeedCall(CeedVectorSetValue(vec, 0.0)); + CeedCall(CeedVectorDestroy(&vec)); + } + // ApplyAdd + CeedCall(CeedOperatorApplyAdd(op, in, out, request)); + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy temporary assembly data associated with a `CeedOperator` + + @param[in,out] op `CeedOperator` whose assembly data to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorAssemblyDataStrip(CeedOperator op) { + bool is_composite; + + CeedCall(CeedQFunctionAssemblyDataDestroy(&op->qf_assembled)); + CeedCall(CeedOperatorAssemblyDataDestroy(&op->op_assembled)); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { + CeedInt num_suboperators; + CeedOperator *sub_operators; + + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + for (CeedInt i = 0; i < num_suboperators; i++) { + CeedCall(CeedQFunctionAssemblyDataDestroy(&sub_operators[i]->qf_assembled)); + CeedCall(CeedOperatorAssemblyDataDestroy(&sub_operators[i]->op_assembled)); + } + } + return CEED_ERROR_SUCCESS; +} + /** @brief Destroy a `CeedOperator` @@ -2086,12 +2434,14 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq @ref User **/ int CeedOperatorDestroy(CeedOperator *op) { - if (!*op || --(*op)->ref_count > 0) { + if (!*op || CeedObjectDereference((CeedObject)*op) > 0) { *op = NULL; return CEED_ERROR_SUCCESS; } - if ((*op)->Destroy) CeedCall((*op)->Destroy(*op)); - CeedCall(CeedDestroy(&(*op)->ceed)); + // Backend destroy + if ((*op)->Destroy) { + CeedCall((*op)->Destroy(*op)); + } // Free fields for (CeedInt i = 0; i < (*op)->num_fields; i++) { if ((*op)->input_fields[i]) { @@ -2121,16 +2471,21 @@ int CeedOperatorDestroy(CeedOperator *op) { CeedCall(CeedFree(&(*op)->output_fields[i])); } } - // AtPoints data + CeedCall(CeedFree(&(*op)->input_fields)); + CeedCall(CeedFree(&(*op)->output_fields)); + // Destroy AtPoints 
data CeedCall(CeedVectorDestroy(&(*op)->point_coords)); CeedCall(CeedElemRestrictionDestroy(&(*op)->rstr_points)); CeedCall(CeedElemRestrictionDestroy(&(*op)->first_points_rstr)); + // Destroy assembly data (must happen before destroying sub_operators) + CeedCall(CeedOperatorAssemblyDataStrip(*op)); // Destroy sub_operators for (CeedInt i = 0; i < (*op)->num_suboperators; i++) { if ((*op)->sub_operators[i]) { CeedCall(CeedOperatorDestroy(&(*op)->sub_operators[i])); } } + CeedCall(CeedFree(&(*op)->sub_operators)); CeedCall(CeedQFunctionDestroy(&(*op)->qf)); CeedCall(CeedQFunctionDestroy(&(*op)->dqf)); CeedCall(CeedQFunctionDestroy(&(*op)->dqfT)); @@ -2146,14 +2501,8 @@ int CeedOperatorDestroy(CeedOperator *op) { // Destroy fallback CeedCall(CeedOperatorDestroy(&(*op)->op_fallback)); - // Destroy assembly data - CeedCall(CeedQFunctionAssemblyDataDestroy(&(*op)->qf_assembled)); - CeedCall(CeedOperatorAssemblyDataDestroy(&(*op)->op_assembled)); - - CeedCall(CeedFree(&(*op)->input_fields)); - CeedCall(CeedFree(&(*op)->output_fields)); - CeedCall(CeedFree(&(*op)->sub_operators)); CeedCall(CeedFree(&(*op)->name)); + CeedCall(CeedObjectDestroy_Private(&(*op)->obj)); CeedCall(CeedFree(op)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c index 88255ced0a..dd83302f2d 100644 --- a/interface/ceed-preconditioning.c +++ b/interface/ceed-preconditioning.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -37,22 +37,22 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, CeedQFunction *qf_fallback) { char *source_path_with_name = NULL; CeedInt num_input_fields, num_output_fields; - Ceed ceed; CeedQFunctionField *input_fields, *output_fields; // Check if NULL qf passed in if (!qf) return CEED_ERROR_SUCCESS; - CeedCall(CeedQFunctionGetCeed(qf, &ceed)); - CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n"); - CeedDebug(ceed, "Creating fallback CeedQFunction\n"); + CeedDebug(CeedQFunctionReturnCeed(qf), "Creating fallback CeedQFunction\n"); if (qf->source_path) { size_t path_len = strlen(qf->source_path), name_len = strlen(qf->kernel_name); + CeedCall(CeedCalloc(path_len + name_len + 2, &source_path_with_name)); memcpy(source_path_with_name, qf->source_path, path_len); memcpy(&source_path_with_name[path_len], ":", 1); memcpy(&source_path_with_name[path_len + 1], qf->kernel_name, name_len); + } else if (qf->user_source) { + CeedCall(CeedStringAllocCopy(qf->user_source, &source_path_with_name)); } else { CeedCall(CeedCalloc(1, &source_path_with_name)); } @@ -70,6 +70,7 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, Cee CeedCall(CeedQFunctionGetContext(qf, &ctx)); CeedCall(CeedQFunctionSetContext(*qf_fallback, ctx)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); } CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); for (CeedInt i = 0; i < num_input_fields; i++) { @@ -112,10 +113,10 @@ static int CeedOperatorCreateFallback(CeedOperator op) { // Fallback Ceed CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback)); + CeedCall(CeedDestroy(&ceed)); if (!ceed_fallback) return CEED_ERROR_SUCCESS; - CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n"); - CeedDebug(ceed, "Creating fallback CeedOperator\n"); + 
CeedDebug(CeedOperatorReturnCeed(op), "Creating fallback CeedOperator\n"); // Clone Op CeedCall(CeedOperatorIsComposite(op, &is_composite)); @@ -123,16 +124,17 @@ static int CeedOperatorCreateFallback(CeedOperator op) { CeedInt num_suboperators; CeedOperator *sub_operators; - CeedCall(CeedCompositeOperatorCreate(ceed_fallback, &op_fallback)); - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCreateComposite(ceed_fallback, &op_fallback)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); for (CeedInt i = 0; i < num_suboperators; i++) { CeedOperator op_sub_fallback; CeedCall(CeedOperatorGetFallback(sub_operators[i], &op_sub_fallback)); - CeedCall(CeedCompositeOperatorAddSub(op_fallback, op_sub_fallback)); + CeedCall(CeedOperatorCompositeAddSub(op_fallback, op_sub_fallback)); } } else { + bool is_at_points = false; CeedInt num_input_fields, num_output_fields; CeedQFunction qf_fallback = NULL, dqf_fallback = NULL, dqfT_fallback = NULL; CeedOperatorField *input_fields, *output_fields; @@ -140,7 +142,19 @@ static int CeedOperatorCreateFallback(CeedOperator op) { CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->qf, &qf_fallback)); CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqf, &dqf_fallback)); CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqfT, &dqfT_fallback)); - CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback)); + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); + if (is_at_points) { + CeedVector points; + CeedElemRestriction rstr_points; + + CeedCall(CeedOperatorCreateAtPoints(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback)); + CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, &points)); + CeedCall(CeedOperatorAtPointsSetPoints(op_fallback, rstr_points, points)); + 
CeedCall(CeedVectorDestroy(&points)); + CeedCall(CeedElemRestrictionDestroy(&rstr_points)); + } else { + CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback)); + } CeedCall(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); for (CeedInt i = 0; i < num_input_fields; i++) { const char *field_name; @@ -150,6 +164,9 @@ static int CeedOperatorCreateFallback(CeedOperator op) { CeedCall(CeedOperatorFieldGetData(input_fields[i], &field_name, &rstr, &basis, &vec)); CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec)); + CeedCall(CeedVectorDestroy(&vec)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); + CeedCall(CeedBasisDestroy(&basis)); } for (CeedInt i = 0; i < num_output_fields; i++) { const char *field_name; @@ -159,8 +176,16 @@ static int CeedOperatorCreateFallback(CeedOperator op) { CeedCall(CeedOperatorFieldGetData(output_fields[i], &field_name, &rstr, &basis, &vec)); CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec)); + CeedCall(CeedVectorDestroy(&vec)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); + CeedCall(CeedBasisDestroy(&basis)); + } + { + CeedQFunctionAssemblyData data; + + CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data)); + CeedCall(CeedQFunctionAssemblyDataReferenceCopy(data, &op_fallback->qf_assembled)); } - CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op->qf_assembled, &op_fallback->qf_assembled)); // Cleanup CeedCall(CeedQFunctionDestroy(&qf_fallback)); CeedCall(CeedQFunctionDestroy(&dqf_fallback)); @@ -172,6 +197,7 @@ static int CeedOperatorCreateFallback(CeedOperator op) { // The op holds the only reference to op_fallback and is responsible for deleting itself and op_fallback. 
op->op_fallback = op_fallback; op_fallback->op_fallback_parent = op; + CeedCall(CeedDestroy(&ceed_fallback)); return CEED_ERROR_SUCCESS; } @@ -187,14 +213,12 @@ static int CeedOperatorCreateFallback(CeedOperator op) { @ref Developer **/ -static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block, +static inline int CeedOperatorLinearAssembleAddDiagonalSingle_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block, CeedVector assembled) { - Ceed ceed; bool is_composite; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); - CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); + CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); // Assemble QFunction CeedInt layout_qf[3]; @@ -251,7 +275,7 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator continue; } // No matching output basis found } - CeedCheck(active_elem_rstrs_in[b_in] == active_elem_rstrs_out[b_out], ceed, CEED_ERROR_UNSUPPORTED, + CeedCheck(active_elem_rstrs_in[b_in] == active_elem_rstrs_out[b_out], CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Cannot assemble operator diagonal with different input and output active element restrictions"); // Assemble point block diagonal restriction, if needed @@ -374,15 +398,13 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator @ref Developer **/ -static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block, +static inline int CeedOperatorLinearAssembleAddDiagonalSingle(CeedOperator op, CeedRequest *request, const bool is_point_block, CeedVector assembled) { - Ceed ceed; bool is_at_points; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); - CeedCheck(!is_at_points, 
ceed, CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported"); - CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(op, request, is_point_block, assembled)); + CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported"); + CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle_Mesh(op, request, is_point_block, assembled)); return CEED_ERROR_SUCCESS; } @@ -398,13 +420,13 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, C @ref Developer **/ -static inline int CeedCompositeOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block, +static inline int CeedOperatorLinearAssembleAddDiagonalComposite(CeedOperator op, CeedRequest *request, const bool is_point_block, CeedVector assembled) { CeedInt num_sub; CeedOperator *suboperators; - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub)); - CeedCall(CeedCompositeOperatorGetSubList(op, &suboperators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub)); + CeedCall(CeedOperatorCompositeGetSubList(op, &suboperators)); for (CeedInt i = 0; i < num_sub; i++) { if (is_point_block) { CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(suboperators[i], assembled, request)); @@ -429,7 +451,7 @@ static inline int CeedCompositeOperatorLinearAssembleAddDiagonal(CeedOperator op @ref Developer **/ -static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, CeedInt *rows, CeedInt *cols) { +static int CeedOperatorAssembleSymbolicSingle(CeedOperator op, CeedInt offset, CeedInt *rows, CeedInt *cols) { Ceed ceed; bool is_composite; CeedSize num_nodes_in, num_nodes_out, local_num_entries, count = 0; @@ -440,8 +462,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C CeedVector index_vec_in, index_vec_out, elem_dof_in, elem_dof_out; CeedElemRestriction elem_rstr_in, elem_rstr_out, index_elem_rstr_in, index_elem_rstr_out; - 
CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes_in, &num_nodes_out)); @@ -454,7 +476,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C // Determine elem_dof relation for input CeedCall(CeedVectorCreate(ceed, num_nodes_in, &index_vec_in)); CeedCall(CeedVectorGetArrayWrite(index_vec_in, CEED_MEM_HOST, &array)); - for (CeedInt i = 0; i < num_nodes_in; i++) array[i] = i; + for (CeedSize i = 0; i < num_nodes_in; i++) array[i] = i; CeedCall(CeedVectorRestoreArray(index_vec_in, &array)); CeedCall(CeedVectorCreate(ceed, num_elem_in * elem_size_in * num_comp_in, &elem_dof_in)); CeedCall(CeedVectorSetValue(elem_dof_in, 0.0)); @@ -467,7 +489,9 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C if (elem_rstr_in != elem_rstr_out) { CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out)); CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, - "Active input and output operator restrictions must have the same number of elements"); + "Active input and output operator restrictions must have the same number of elements." 
+ " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.", + num_elem_in, num_elem_out); CeedCall(CeedElemRestrictionGetElementSize(elem_rstr_out, &elem_size_out)); CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out)); CeedCall(CeedElemRestrictionGetELayout(elem_rstr_out, layout_er_out)); @@ -475,7 +499,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C // Determine elem_dof relation for output CeedCall(CeedVectorCreate(ceed, num_nodes_out, &index_vec_out)); CeedCall(CeedVectorGetArrayWrite(index_vec_out, CEED_MEM_HOST, &array)); - for (CeedInt i = 0; i < num_nodes_out; i++) array[i] = i; + for (CeedSize i = 0; i < num_nodes_out; i++) array[i] = i; CeedCall(CeedVectorRestoreArray(index_vec_out, &array)); CeedCall(CeedVectorCreate(ceed, num_elem_out * elem_size_out * num_comp_out, &elem_dof_out)); CeedCall(CeedVectorSetValue(elem_dof_out, 0.0)); @@ -493,7 +517,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C layout_er_out[2] = layout_er_in[2]; elem_dof_a_out = elem_dof_a_in; } - local_num_entries = elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in; + local_num_entries = (CeedSize)elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in; // Determine i, j locations for element matrices for (CeedInt e = 0; e < num_elem_in; e++) { @@ -521,9 +545,116 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C CeedCall(CeedVectorRestoreArrayRead(elem_dof_out, &elem_dof_a_out)); CeedCall(CeedVectorDestroy(&elem_dof_out)); } + CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in)); + CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out)); + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Core logic to assemble `CeedQFunction` and store result internally. + + Return copied references of stored data to the caller. 
+ Caller is responsible for ownership and destruction of the copied references. + See also @ref CeedOperatorLinearAssembleQFunction(). + + Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers. + These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object. + + @param[in] op `CeedOperator` to assemble `CeedQFunction` + @param[in] use_parent Boolean flag to check for fallback parent implementation + @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points + @param[out] rstr `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction` + @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op, bool use_parent, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { + int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL; + CeedOperator op_assemble = NULL; + CeedOperator op_fallback_parent = NULL; + + CeedCall(CeedOperatorCheckReady(op)); + + // Determine if fallback parent or operator has implementation + CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent)); + if (op_fallback_parent && use_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) { + // -- Backend version for op fallback parent is faster, if it exists + CeedDebug(CeedOperatorReturnCeed(op), "Using fallback parent for CeedOperatorLinearAssembleQFunctionBuildOrUpdate\n"); + LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate; + op_assemble = op_fallback_parent; + } else if (op->LinearAssembleQFunctionUpdate) { + // -- Backend version for op + LinearAssembleQFunctionUpdate = op->LinearAssembleQFunctionUpdate; + 
op_assemble = op; + } + + // Assemble QFunction + if (LinearAssembleQFunctionUpdate) { + // Backend or fallback parent version + CeedQFunctionAssemblyData data; + bool data_is_setup; + CeedVector assembled_vec = NULL; + CeedElemRestriction assembled_rstr = NULL; + + CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data)); + CeedCall(CeedQFunctionAssemblyDataIsSetup(data, &data_is_setup)); + if (data_is_setup) { + bool update_needed; + + CeedCall(CeedQFunctionAssemblyDataGetObjects(data, &assembled_vec, &assembled_rstr)); + CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(data, &update_needed)); + if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request)); + } else { + CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request)); + CeedCall(CeedQFunctionAssemblyDataSetObjects(data, assembled_vec, assembled_rstr)); + } + CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, false)); + + // Copy reference from internally held copy + CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled)); + CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr)); + CeedCall(CeedVectorDestroy(&assembled_vec)); + CeedCall(CeedElemRestrictionDestroy(&assembled_rstr)); + } else { + // Operator fallback + CeedOperator op_fallback; + + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleQFunctionBuildOrUpdate\n"); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); + if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request)); + else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate"); + } return CEED_ERROR_SUCCESS; } +/** + @brief Assemble `CeedQFunction` and store result internally, but do not use fallback parent. + + Return copied references of stored data to the caller. 
+ Caller is responsible for ownership and destruction of the copied references. + See also @ref CeedOperatorLinearAssembleQFunction(). + + Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers. + These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object. + + @param[in] op `CeedOperator` to assemble `CeedQFunction` + @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points + @param[out] rstr `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction` + @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, false, assembled, rstr, request); +} + /** @brief Assemble nonzero entries for non-composite `CeedOperator`. 
@@ -537,13 +668,11 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C @ref Developer **/ -static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) { - Ceed ceed; - bool is_composite; +int CeedOperatorAssembleSingle(CeedOperator op, CeedInt offset, CeedVector values) { + bool is_composite, is_at_points; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); - CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); + CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); // Early exit for empty operator { @@ -561,13 +690,18 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorAssembleSingle\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - CeedCall(CeedSingleOperatorAssemble(op_fallback, offset, values)); + CeedCall(CeedOperatorAssembleSingle(op_fallback, offset, values)); return CEED_ERROR_SUCCESS; } } + CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); + CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Backend does not implement CeedOperatorLinearAssemble for AtPoints operator"); + // Assemble QFunction CeedInt layout_qf[3]; const CeedScalar *assembled_qf_array; @@ -597,9 +731,10 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto CeedCall(CeedOperatorAssemblyDataGetEvalModes(data, &num_active_bases_in, &num_eval_modes_in, &eval_modes_in, NULL, &num_active_bases_out, &num_eval_modes_out, &eval_modes_out, NULL, NULL)); - CeedCheck(num_active_bases_in == num_active_bases_out && num_active_bases_in == 1, ceed, CEED_ERROR_UNSUPPORTED, + CeedCheck(num_active_bases_in == 1 && num_active_bases_out == 1, CeedOperatorReturnCeed(op), 
CEED_ERROR_UNSUPPORTED, "Cannot assemble operator with multiple active bases"); - CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); + CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Cannot assemble operator without inputs/outputs"); CeedCall(CeedOperatorAssemblyDataGetBases(data, NULL, &active_bases_in, &B_mats_in, NULL, &active_bases_out, &B_mats_out)); CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out)); @@ -623,14 +758,18 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto if (elem_rstr_in != elem_rstr_out) { CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out)); - CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, - "Active input and output operator restrictions must have the same number of elements"); + CeedCheck(num_elem_in == num_elem_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Active input and output operator restrictions must have the same number of elements." + " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.", + num_elem_in, num_elem_out); CeedCall(CeedElemRestrictionGetElementSize(elem_rstr_out, &elem_size_out)); CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out)); if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; else CeedCall(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); - CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED, - "Active input and output bases must have the same number of quadrature points"); + CeedCheck(num_qpts_in == num_qpts_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Active input and output bases must have the same number of quadrature points." 
+ " Input has %" CeedInt_FMT " points; output has %" CeedInt_FMT "points.", + num_qpts_in, num_qpts_out); CeedCall(CeedElemRestrictionGetType(elem_rstr_out, &elem_rstr_type_out)); if (elem_rstr_type_out == CEED_RESTRICTION_ORIENTED) { @@ -647,7 +786,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto elem_rstr_orients_out = elem_rstr_orients_in; elem_rstr_curl_orients_out = elem_rstr_curl_orients_in; } - local_num_entries = elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in; + local_num_entries = (CeedSize)elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in; // Loop over elements and put in data structure // We store B_mat_in, B_mat_out, BTD, elem_mat in row-major order @@ -687,7 +826,11 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto CeedCall(CeedTensorContractApply(contract, 1, num_qpts_in * num_eval_modes_in[0], elem_size_in, elem_size_out, BTD_mat, CEED_NOTRANSPOSE, false, B_mat_in, elem_mat)); } else { + Ceed ceed; + + CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedMatrixMatrixMultiply(ceed, BTD_mat, B_mat_in, elem_mat, elem_size_out, elem_size_in, num_qpts_in * num_eval_modes_in[0])); + CeedCall(CeedDestroy(&ceed)); } // Transform the element matrix if required @@ -746,7 +889,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto } } } - CeedCheck(count == local_num_entries, ceed, CEED_ERROR_MAJOR, "Error computing entries"); + CeedCheck(count == local_num_entries, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Error computing entries"); CeedCall(CeedVectorRestoreArray(values, &vals)); // Cleanup @@ -767,6 +910,8 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto } CeedCall(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); CeedCall(CeedVectorDestroy(&assembled_qf)); + CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in)); + 
CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out)); return CEED_ERROR_SUCCESS; } @@ -780,15 +925,13 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto @ref Utility **/ -static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num_entries) { +static int CeedOperatorAssemblyCountEntriesSingle(CeedOperator op, CeedSize *num_entries) { bool is_composite; CeedInt num_elem_in, elem_size_in, num_comp_in, num_elem_out, elem_size_out, num_comp_out; - Ceed ceed; CeedElemRestriction rstr_in, rstr_out; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); - CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); + CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); CeedCall(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); CeedCall(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in)); @@ -796,8 +939,10 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num CeedCall(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); if (rstr_in != rstr_out) { CeedCall(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out)); - CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, - "Active input and output operator restrictions must have the same number of elements"); + CeedCheck(num_elem_in == num_elem_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, + "Active input and output operator restrictions must have the same number of elements." 
+ " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.", + num_elem_in, num_elem_out); CeedCall(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); CeedCall(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); } else { @@ -805,10 +950,48 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num elem_size_out = elem_size_in; num_comp_out = num_comp_in; } + CeedCall(CeedElemRestrictionDestroy(&rstr_in)); + CeedCall(CeedElemRestrictionDestroy(&rstr_out)); *num_entries = (CeedSize)elem_size_in * num_comp_in * elem_size_out * num_comp_out * num_elem_in; return CEED_ERROR_SUCCESS; } +/** + @brief Count number of entries for assembled `CeedOperator` + + @param[in] op `CeedOperator` to assemble + @param[out] num_entries Number of entries in assembled representation + + @return An error code: 0 - success, otherwise - failure + + @ref Utility +**/ +int CeedOperatorLinearAssembleGetNumEntries(CeedOperator op, CeedSize *num_entries) { + bool is_composite; + + CeedCall(CeedOperatorCheckReady(op)); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + + if (is_composite) { + CeedInt num_suboperators; + CeedOperator *sub_operators; + + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + + *num_entries = 0; + for (CeedInt k = 0; k < num_suboperators; ++k) { + CeedSize single_entries; + + CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries)); + *num_entries += single_entries; + } + } else { + CeedCall(CeedOperatorAssemblyCountEntriesSingle(op, num_entries)); + } + return CEED_ERROR_SUCCESS; +} + /** @brief Common code for creating a multigrid coarse `CeedOperator` and level transfer `CeedOperator` for a `CeedOperator` @@ -825,11 +1008,12 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num @ref Developer **/ -static int CeedSingleOperatorMultigridLevel(CeedOperator 
op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) { +static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, + CeedBasis basis_coarse, CeedBasis basis_c_to_f, CeedOperator *op_coarse, + CeedOperator *op_prolong, CeedOperator *op_restrict) { bool is_composite; Ceed ceed; - CeedInt num_comp, num_input_fields, num_output_fields; + CeedInt dim = 0, num_comp, num_input_fields, num_output_fields; CeedVector mult_vec = NULL; CeedElemRestriction rstr_p_mult_fine = NULL, rstr_fine = NULL; CeedOperatorField *input_fields, *output_fields; @@ -841,66 +1025,144 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Automatic multigrid setup for composite operators not supported"); // Coarse Grid - CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse)); + { + bool is_at_points; + + CeedCall(CeedOperatorIsAtPoints(op_fine, &is_at_points)); + if (is_at_points) { + CeedVector point_coords; + CeedElemRestriction rstr_points; + + CeedCall(CeedOperatorCreateAtPoints(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse)); + CeedCall(CeedOperatorAtPointsGetPoints(op_fine, &rstr_points, &point_coords)); + CeedCall(CeedOperatorAtPointsSetPoints(*op_coarse, rstr_points, point_coords)); + CeedCall(CeedVectorDestroy(&point_coords)); + CeedCall(CeedElemRestrictionDestroy(&rstr_points)); + } else { + CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse)); + } + } CeedCall(CeedOperatorGetFields(op_fine, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // -- Clone input fields for (CeedInt i = 0; i < num_input_fields; i++) { const char *field_name; CeedVector vec; - CeedElemRestriction rstr; - 
CeedBasis basis; + CeedElemRestriction rstr = NULL; + CeedBasis basis = NULL; CeedCall(CeedOperatorFieldGetName(input_fields[i], &field_name)); CeedCall(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - rstr = rstr_coarse; - basis = basis_coarse; - CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_fine)); + CeedCall(CeedElemRestrictionReferenceCopy(rstr_coarse, &rstr)); + CeedCall(CeedBasisReferenceCopy(basis_coarse, &basis)); + if (!rstr_fine) CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_fine)); } else { CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr)); CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis)); } + if (dim == 0) CeedCall(CeedBasisGetDimension(basis, &dim)); CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec)); + CeedCall(CeedVectorDestroy(&vec)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); + CeedCall(CeedBasisDestroy(&basis)); } // -- Clone output fields for (CeedInt i = 0; i < num_output_fields; i++) { const char *field_name; CeedVector vec; - CeedElemRestriction rstr; - CeedBasis basis; + CeedElemRestriction rstr = NULL; + CeedBasis basis = NULL; CeedCall(CeedOperatorFieldGetName(output_fields[i], &field_name)); CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - rstr = rstr_coarse; - basis = basis_coarse; - CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_fine)); + CeedCall(CeedElemRestrictionReferenceCopy(rstr_coarse, &rstr)); + CeedCall(CeedBasisReferenceCopy(basis_coarse, &basis)); + if (!rstr_fine) CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_fine)); } else { CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr)); CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis)); } + if (dim == 0) CeedCall(CeedBasisGetDimension(basis, &dim)); CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec)); + 
CeedCall(CeedVectorDestroy(&vec)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); + CeedCall(CeedBasisDestroy(&basis)); } + dim = dim ? dim : 1; // -- Clone QFunctionAssemblyData - CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op_fine->qf_assembled, &(*op_coarse)->qf_assembled)); + { + CeedQFunctionAssemblyData fine_data; + + CeedCall(CeedOperatorGetQFunctionAssemblyData(op_fine, &fine_data)); + CeedCall(CeedQFunctionAssemblyDataReferenceCopy(fine_data, &(*op_coarse)->qf_assembled)); + } // Multiplicity vector + bool use_scalar_mult = true; + if (op_restrict || op_prolong) { - CeedVector mult_e_vec; + CeedInt num_elem, num_comp, elem_size; + CeedVector mult_l_vec, mult_e_vec; CeedRestrictionType rstr_type; + CeedElemRestriction rstr_p_mult_full; CeedCall(CeedElemRestrictionGetType(rstr_fine, &rstr_type)); CeedCheck(rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_UNSUPPORTED, "Element restrictions created with CeedElemRestrictionCreateCurlOriented are not supported"); CeedCheck(p_mult_fine, ceed, CEED_ERROR_INCOMPATIBLE, "Prolongation or restriction operator creation requires fine grid multiplicity vector"); - CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_fine)); - CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_vec, &mult_e_vec)); + + // Create multiplicity multi-component l-vector + CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_full)); + CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem)); + CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp)); + CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size)); + CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_l_vec, &mult_e_vec)); CeedCall(CeedVectorSetValue(mult_e_vec, 0.0)); - CeedCall(CeedElemRestrictionApply(rstr_p_mult_fine, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE)); - CeedCall(CeedVectorSetValue(mult_vec, 0.0)); - 
CeedCall(CeedElemRestrictionApply(rstr_p_mult_fine, CEED_TRANSPOSE, mult_e_vec, mult_vec, CEED_REQUEST_IMMEDIATE)); + CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE)); + CeedCall(CeedVectorSetValue(mult_l_vec, 0.0)); + CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_TRANSPOSE, mult_e_vec, mult_l_vec, CEED_REQUEST_IMMEDIATE)); + CeedCall(CeedVectorReciprocal(mult_l_vec)); + + // Determine to use scalar multiplicity or not + { + const CeedInt p = pow(elem_size, 1.0 / dim); + + use_scalar_mult = num_comp > 1 && (dim < 3 || num_comp - 1 > (3 * (pow(p, dim - 1) - pow(p, dim - 2)) + 1) / pow(p - 1, dim)); + } + + if (use_scalar_mult) { + // Create multiplicity single component e-vector + CeedCall(CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &rstr_p_mult_fine)); + CeedCall(CeedElemRestrictionCreateVector(rstr_p_mult_fine, &mult_vec, NULL)); + { + CeedQFunction qf_to_scalar; + CeedOperator op_to_scalar; + + CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Identity to scalar", &qf_to_scalar)); + CeedCall(CeedQFunctionAddInput(qf_to_scalar, "input", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf_to_scalar, "output", 1, CEED_EVAL_NONE)); + + CeedCall(CeedOperatorCreate(ceed, qf_to_scalar, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_scalar)); + CeedCall(CeedOperatorSetField(op_to_scalar, "input", rstr_p_mult_full, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + CeedCall(CeedOperatorSetField(op_to_scalar, "output", rstr_p_mult_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + + CeedCall(CeedOperatorApply(op_to_scalar, mult_l_vec, mult_vec, CEED_REQUEST_IMMEDIATE)); + + // Clean-up + CeedCall(CeedQFunctionDestroy(&qf_to_scalar)); + CeedCall(CeedOperatorDestroy(&op_to_scalar)); + } + } else { + mult_vec = NULL; + CeedCall(CeedVectorReferenceCopy(mult_l_vec, &mult_vec)); + rstr_p_mult_fine = NULL; + 
CeedCall(CeedElemRestrictionReferenceCopy(rstr_p_mult_full, &rstr_p_mult_fine)); + } + // Clean-up CeedCall(CeedVectorDestroy(&mult_e_vec)); - CeedCall(CeedVectorReciprocal(mult_vec)); + CeedCall(CeedVectorDestroy(&mult_l_vec)); + CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_full)); } // Clone name @@ -921,7 +1183,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m CeedQFunctionContext ctx_r; CeedQFunction qf_restrict; - CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_restrict)); + CeedCall(CeedQFunctionCreateInteriorByName(ceed, use_scalar_mult ? "Scale (scalar)" : "Scale", &qf_restrict)); CeedCall(CeedCalloc(1, &num_comp_r_data)); num_comp_r_data[0] = num_comp; CeedCall(CeedQFunctionContextCreate(ceed, &ctx_r)); @@ -929,7 +1191,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m CeedCall(CeedQFunctionSetContext(qf_restrict, ctx_r)); CeedCall(CeedQFunctionContextDestroy(&ctx_r)); CeedCall(CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE)); - CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", use_scalar_mult ? 1 : num_comp, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf_restrict, "output", num_comp, CEED_EVAL_INTERP)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp)); @@ -959,7 +1221,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m CeedQFunctionContext ctx_p; CeedQFunction qf_prolong; - CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_prolong)); + CeedCall(CeedQFunctionCreateInteriorByName(ceed, use_scalar_mult ? 
"Scale (scalar)" : "Scale", &qf_prolong)); CeedCall(CeedCalloc(1, &num_comp_p_data)); num_comp_p_data[0] = num_comp; CeedCall(CeedQFunctionContextCreate(ceed, &ctx_p)); @@ -967,7 +1229,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m CeedCall(CeedQFunctionSetContext(qf_prolong, ctx_p)); CeedCall(CeedQFunctionContextDestroy(&ctx_p)); CeedCall(CeedQFunctionAddInput(qf_prolong, "input", num_comp, CEED_EVAL_INTERP)); - CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", use_scalar_mult ? 1 : num_comp, CEED_EVAL_NONE)); CeedCall(CeedQFunctionAddOutput(qf_prolong, "output", num_comp, CEED_EVAL_NONE)); CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_prolong, num_comp)); @@ -995,7 +1257,9 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m CeedCall(CeedOperatorCheckReady(*op_coarse)); // Cleanup + CeedCall(CeedDestroy(&ceed)); CeedCall(CeedVectorDestroy(&mult_vec)); + CeedCall(CeedElemRestrictionDestroy(&rstr_fine)); CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_fine)); CeedCall(CeedBasisDestroy(&basis_c_to_f)); return CEED_ERROR_SUCCESS; @@ -1122,6 +1386,28 @@ int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, Ceed // Cleanup CeedCall(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get `CeedQFunctionAssemblyData` + + @param[in] op `CeedOperator` to assemble + @param[out] data `CeedQFunctionAssemblyData` + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyData *data) { + if (!op->qf_assembled) { + CeedQFunctionAssemblyData data; + + CeedCall(CeedQFunctionAssemblyDataCreate(CeedOperatorReturnCeed(op), &data)); + op->qf_assembled = data; + } + *data = op->qf_assembled; return CEED_ERROR_SUCCESS; } @@ -1138,8 
+1424,7 @@ int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, Ceed int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data) { CeedCall(CeedCalloc(1, data)); (*data)->ref_count = 1; - (*data)->ceed = ceed; - CeedCall(CeedReference(ceed)); + CeedCall(CeedReferenceCopy(ceed, &(*data)->ceed)); return CEED_ERROR_SUCCESS; } @@ -1304,7 +1589,7 @@ int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data) { @brief Get `CeedOperatorAssemblyData` @param[in] op `CeedOperator` to assemble - @param[out] data `CeedQFunctionAssemblyData` + @param[out] data `CeedOperatorAssemblyData` @return An error code: 0 - success, otherwise - failure @@ -1314,7 +1599,7 @@ int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyDat if (!op->op_assembled) { CeedOperatorAssemblyData data; - CeedCall(CeedOperatorAssemblyDataCreate(op->ceed, op, &data)); + CeedCall(CeedOperatorAssemblyDataCreate(CeedOperatorReturnCeed(op), op, &data)); op->op_assembled = data; } *data = op->op_assembled; @@ -1354,8 +1639,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem // Allocate CeedCall(CeedCalloc(1, data)); - (*data)->ceed = ceed; - CeedCall(CeedReference(ceed)); + CeedCall(CeedReferenceCopy(ceed, &(*data)->ceed)); // Build OperatorAssembly data CeedCall(CeedOperatorGetQFunction(op, &qf)); @@ -1390,6 +1674,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem (*data)->active_elem_rstrs_in[num_active_bases_in] = NULL; CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr_in)); CeedCall(CeedElemRestrictionReferenceCopy(elem_rstr_in, &(*data)->active_elem_rstrs_in[num_active_bases_in])); + CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in)); CeedCall(CeedRealloc(num_active_bases_in + 1, &num_eval_modes_in)); num_eval_modes_in[index] = 0; CeedCall(CeedRealloc(num_active_bases_in + 1, &eval_modes_in)); @@ -1411,7 +1696,9 @@ int 
CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem } num_eval_modes_in[index] += q_comp; } + CeedCall(CeedBasisDestroy(&basis_in)); } + CeedCall(CeedVectorDestroy(&vec)); } // Determine active output basis @@ -1445,6 +1732,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem (*data)->active_elem_rstrs_out[num_active_bases_out] = NULL; CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr_out)); CeedCall(CeedElemRestrictionReferenceCopy(elem_rstr_out, &(*data)->active_elem_rstrs_out[num_active_bases_out])); + CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out)); CeedCall(CeedRealloc(num_active_bases_out + 1, &num_eval_modes_out)); num_eval_modes_out[index] = 0; CeedCall(CeedRealloc(num_active_bases_out + 1, &eval_modes_out)); @@ -1466,8 +1754,11 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem } num_eval_modes_out[index] += q_comp; } + CeedCall(CeedBasisDestroy(&basis_out)); } + CeedCall(CeedVectorDestroy(&vec)); } + CeedCall(CeedQFunctionDestroy(&qf)); (*data)->num_active_bases_in = num_active_bases_in; (*data)->num_eval_modes_in = num_eval_modes_in; (*data)->eval_modes_in = eval_modes_in; @@ -1736,15 +2027,19 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) { CeedCall(CeedIsDebug(ceed, &is_debug)); if (is_debug) { Ceed ceed_fallback; - const char *resource, *resource_fallback; + const char *resource, *resource_fallback, *op_name; CeedCall(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback)); CeedCall(CeedGetResource(ceed, &resource)); CeedCall(CeedGetResource(ceed_fallback, &resource_fallback)); + CeedCall(CeedOperatorGetName(op, &op_name)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n"); - CeedDebug(ceed, "Falling back from %s operator at address %p to %s operator at address %p\n", resource, op, resource_fallback, op->op_fallback); + CeedDebug(ceed, "Falling back from Operator with 
backend %s at address %p to Operator with backend %s at address %p for CeedOperator \"%s\"\n", + resource, op, resource_fallback, op->op_fallback, op_name); + CeedCall(CeedDestroy(&ceed_fallback)); } + CeedCall(CeedDestroy(&ceed)); } *op_fallback = op->op_fallback; return CEED_ERROR_SUCCESS; @@ -1776,7 +2071,9 @@ int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent) { @ref Backend **/ int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent) { - *parent = op->op_fallback_parent ? op->op_fallback_parent->ceed : op->ceed; + *parent = NULL; + if (op->op_fallback_parent) CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op->op_fallback_parent), parent)); + else CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op), parent)); return CEED_ERROR_SUCCESS; } @@ -1816,13 +2113,12 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedCall(op->LinearAssembleQFunction(op, assembled, rstr, request)); } else { // Operator fallback - Ceed ceed; CeedOperator op_fallback; - CeedCall(CeedOperatorGetCeed(op, &ceed)); + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleQFunction\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunction(op_fallback, assembled, rstr, request)); - else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction"); + else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction"); } return CEED_ERROR_SUCCESS; } @@ -1847,60 +2143,7 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, @ref User **/ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL; - 
CeedOperator op_assemble = NULL; - CeedOperator op_fallback_parent = NULL; - - CeedCall(CeedOperatorCheckReady(op)); - - // Determine if fallback parent or operator has implementation - CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent)); - if (op_fallback_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) { - // -- Backend version for op fallback parent is faster, if it exists - LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate; - op_assemble = op_fallback_parent; - } else if (op->LinearAssembleQFunctionUpdate) { - // -- Backend version for op - LinearAssembleQFunctionUpdate = op->LinearAssembleQFunctionUpdate; - op_assemble = op; - } - - // Assemble QFunction - if (LinearAssembleQFunctionUpdate) { - // Backend or fallback parent version - bool qf_assembled_is_setup; - CeedVector assembled_vec = NULL; - CeedElemRestriction assembled_rstr = NULL; - - CeedCall(CeedQFunctionAssemblyDataIsSetup(op->qf_assembled, &qf_assembled_is_setup)); - if (qf_assembled_is_setup) { - bool update_needed; - - CeedCall(CeedQFunctionAssemblyDataGetObjects(op->qf_assembled, &assembled_vec, &assembled_rstr)); - CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(op->qf_assembled, &update_needed)); - if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request)); - } else { - CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request)); - CeedCall(CeedQFunctionAssemblyDataSetObjects(op->qf_assembled, assembled_vec, assembled_rstr)); - } - CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, false)); - - // Copy reference from internally held copy - CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled)); - CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr)); - CeedCall(CeedVectorDestroy(&assembled_vec)); - CeedCall(CeedElemRestrictionDestroy(&assembled_rstr)); - } else { - // Operator fallback - Ceed ceed; - 
CeedOperator op_fallback; - - CeedCall(CeedOperatorGetCeed(op, &ceed)); - CeedCall(CeedOperatorGetFallback(op, &op_fallback)); - if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request)); - else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate"); - } - return CEED_ERROR_SUCCESS; + return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, true, assembled, rstr, request); } /** @@ -1923,14 +2166,12 @@ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { bool is_composite; CeedSize input_size = 0, output_size = 0; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorCheckReady(op)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); - CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square"); + CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square"); // Early exit for empty operator if (!is_composite) { @@ -1949,10 +2190,16 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce CeedCall(CeedVectorSetValue(assembled, 0.0)); CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; + } else if (is_composite) { + // Default to summing contributions of suboperators + CeedCall(CeedVectorSetValue(assembled, 0.0)); + CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, false, assembled)); + return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleDiagonal\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { 
CeedCall(CeedOperatorLinearAssembleDiagonal(op_fallback, assembled, request)); @@ -1985,14 +2232,12 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { bool is_composite; CeedSize input_size = 0, output_size = 0; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorCheckReady(op)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); - CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square"); + CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square"); // Early exit for empty operator if (!is_composite) { @@ -2006,10 +2251,15 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, // Backend version CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; + } else if (is_composite) { + // Default to summing contributions of suboperators + CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, false, assembled)); + return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleAddDiagonal\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { CeedCall(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request)); @@ -2017,11 +2267,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, } } // Default interface implementation - if (is_composite) { - CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled)); - } else { - CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, false, assembled)); - } + CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle(op, 
request, false, assembled)); return CEED_ERROR_SUCCESS; } @@ -2046,21 +2292,19 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, @ref User **/ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols) { - Ceed ceed; bool is_composite; CeedInt num_active_components, num_sub_operators; CeedOperator *sub_operators; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedSize input_size = 0, output_size = 0; CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); - CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square"); + CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square"); if (is_composite) { - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub_operators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub_operators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); } else { sub_operators = &op; num_sub_operators = 1; @@ -2086,11 +2330,13 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi CeedInt comp_stride_sub, num_active_components_sub; CeedCall(CeedElemRestrictionGetCompStride(active_elem_rstrs[i], &comp_stride_sub)); - CeedCheck(comp_stride == comp_stride_sub, ceed, CEED_ERROR_DIMENSION, + CeedCheck(comp_stride == comp_stride_sub, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Active element restrictions must have the same component stride: %d vs %d", comp_stride, comp_stride_sub); CeedCall(CeedElemRestrictionGetNumComponents(active_elem_rstrs[i], &num_active_components_sub)); - CeedCheck(num_active_components == num_active_components_sub, ceed, CEED_ERROR_INCOMPATIBLE, - "All suboperators must have the same number of output components"); + CeedCheck(num_active_components 
== num_active_components_sub, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, + "All suboperators must have the same number of output components." + " Previous: %" CeedInt_FMT " Current: %" CeedInt_FMT, + num_active_components, num_active_components_sub); } } } @@ -2123,6 +2369,7 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi CeedCall(CeedElemRestrictionRestoreOffsets(active_elem_rstr, &offsets)); CeedCall(CeedElemRestrictionRestoreOffsets(point_block_active_elem_rstr, &point_block_offsets)); + CeedCall(CeedElemRestrictionDestroy(&active_elem_rstr)); CeedCall(CeedElemRestrictionDestroy(&point_block_active_elem_rstr)); } return CEED_ERROR_SUCCESS; @@ -2150,14 +2397,12 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { bool is_composite; CeedSize input_size = 0, output_size = 0; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorCheckReady(op)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); - CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square"); + CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square"); // Early exit for empty operator if (!is_composite) { @@ -2180,6 +2425,7 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector ass // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssemblePointBlockDiagonal\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { CeedCall(CeedOperatorLinearAssemblePointBlockDiagonal(op_fallback, assembled, request)); @@ -2214,14 +2460,12 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector ass int 
CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { bool is_composite; CeedSize input_size = 0, output_size = 0; - Ceed ceed; - CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorCheckReady(op)); CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); - CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square"); + CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square"); // Early exit for empty operator if (!is_composite) { @@ -2239,6 +2483,7 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleAddPointBlockDiagonal\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(op_fallback, assembled, request)); @@ -2247,9 +2492,9 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector } // Default interface implementation if (is_composite) { - CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, true, assembled)); + CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, true, assembled)); } else { - CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, true, assembled)); + CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle(op, request, true, assembled)); } return CEED_ERROR_SUCCESS; } @@ -2291,6 +2536,7 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleSymbolic\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { 
CeedCall(CeedOperatorLinearAssembleSymbolic(op_fallback, num_entries, rows, cols)); @@ -2301,32 +2547,21 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C // Default interface implementation // Count entries and allocate rows, cols arrays - *num_entries = 0; - if (is_composite) { - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - for (CeedInt k = 0; k < num_suboperators; ++k) { - CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries)); - *num_entries += single_entries; - } - } else { - CeedCall(CeedSingleOperatorAssemblyCountEntries(op, &single_entries)); - *num_entries += single_entries; - } + CeedCall(CeedOperatorLinearAssembleGetNumEntries(op, num_entries)); CeedCall(CeedCalloc(*num_entries, rows)); CeedCall(CeedCalloc(*num_entries, cols)); // Assemble nonzero locations if (is_composite) { - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); for (CeedInt k = 0; k < num_suboperators; ++k) { - CeedCall(CeedSingleOperatorAssembleSymbolic(sub_operators[k], offset, *rows, *cols)); - CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries)); + CeedCall(CeedOperatorAssembleSymbolicSingle(sub_operators[k], offset, *rows, *cols)); + CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries)); offset += single_entries; } } else { - CeedCall(CeedSingleOperatorAssembleSymbolic(op, offset, *rows, *cols)); + CeedCall(CeedOperatorAssembleSymbolicSingle(op, offset, *rows, *cols)); } return CEED_ERROR_SUCCESS; } @@ -2370,10 +2605,26 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) { // Backend version CeedCall(op->LinearAssemble(op, values)); return 
CEED_ERROR_SUCCESS; + } else if (is_composite) { + // Default to summing contributions of suboperators + CeedCall(CeedVectorSetValue(values, 0.0)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); + for (CeedInt k = 0; k < num_suboperators; k++) { + CeedCall(CeedOperatorAssembleSingle(sub_operators[k], offset, values)); + CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries)); + offset += single_entries; + } + return CEED_ERROR_SUCCESS; + } else if (op->LinearAssembleSingle) { + CeedCall(CeedVectorSetValue(values, 0.0)); + CeedCall(CeedOperatorAssembleSingle(op, offset, values)); + return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssemble\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { CeedCall(CeedOperatorLinearAssemble(op_fallback, values)); @@ -2381,19 +2632,9 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) { } } - // Default interface implementation + // Default to interface version if non-composite and no fallback CeedCall(CeedVectorSetValue(values, 0.0)); - if (is_composite) { - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); - for (CeedInt k = 0; k < num_suboperators; k++) { - CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values)); - CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries)); - offset += single_entries; - } - } else { - CeedCall(CeedSingleOperatorAssemble(op, offset, values)); - } + CeedCall(CeedOperatorAssembleSingle(op, offset, values)); return CEED_ERROR_SUCCESS; } @@ -2411,7 +2652,7 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) { @ref User **/ -int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt 
num_skip_indices, CeedInt *skip_indices, CeedVector mult) { +int CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult) { Ceed ceed; CeedInt num_suboperators; CeedSize l_vec_len; @@ -2422,19 +2663,19 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic CeedCall(CeedOperatorCheckReady(op)); - CeedCall(CeedOperatorGetCeed(op, &ceed)); - // Zero mult vector CeedCall(CeedVectorSetValue(mult, 0.0)); // Get suboperators - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); if (num_suboperators == 0) return CEED_ERROR_SUCCESS; + CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); // Work vector CeedCall(CeedVectorGetLength(mult, &l_vec_len)); + CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedVectorCreate(ceed, l_vec_len, &ones_l_vec)); + CeedCall(CeedDestroy(&ceed)); CeedCall(CeedVectorSetValue(ones_l_vec, 1.0)); CeedCall(CeedVectorGetArray(mult, CEED_MEM_HOST, &mult_array)); @@ -2451,13 +2692,14 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic // -- Sub operator multiplicity CeedCall(CeedOperatorGetActiveElemRestriction(sub_operators[i], &elem_rstr)); CeedCall(CeedElemRestrictionCreateUnorientedCopy(elem_rstr, &mult_elem_rstr)); + CeedCall(CeedElemRestrictionDestroy(&elem_rstr)); CeedCall(CeedElemRestrictionCreateVector(mult_elem_rstr, &sub_mult_l_vec, &ones_e_vec)); CeedCall(CeedVectorSetValue(sub_mult_l_vec, 0.0)); CeedCall(CeedElemRestrictionApply(mult_elem_rstr, CEED_NOTRANSPOSE, ones_l_vec, ones_e_vec, CEED_REQUEST_IMMEDIATE)); CeedCall(CeedElemRestrictionApply(mult_elem_rstr, CEED_TRANSPOSE, ones_e_vec, sub_mult_l_vec, CEED_REQUEST_IMMEDIATE)); CeedCall(CeedVectorGetArrayRead(sub_mult_l_vec, CEED_MEM_HOST, &sub_mult_array)); // ---- Flag every node present in the current 
suboperator - for (CeedInt j = 0; j < l_vec_len; j++) { + for (CeedSize j = 0; j < l_vec_len; j++) { if (sub_mult_array[j] > 0.0) mult_array[j] += 1.0; } CeedCall(CeedVectorRestoreArrayRead(sub_mult_l_vec, &sub_mult_array)); @@ -2499,10 +2741,12 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine)); CeedCall(CeedBasisCreateProjection(basis_coarse, basis_fine, &basis_c_to_f)); + CeedCall(CeedBasisDestroy(&basis_fine)); } // Core code - CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); + CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, + op_restrict)); return CEED_ERROR_SUCCESS; } @@ -2538,7 +2782,10 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_ CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine)); CeedCall(CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f)); CeedCall(CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c)); - CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); + CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION, + "Bases must have compatible quadrature spaces." + " Fine grid: %" CeedInt_FMT " points, Coarse grid: %" CeedInt_FMT " points", + Q_f, Q_c); // Create coarse to fine basis, if required if (op_prolong || op_restrict) { @@ -2551,6 +2798,7 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_ CeedCall(CeedBasisGetDimension(basis_fine, &dim)); CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp)); CeedCall(CeedBasisGetNumNodes1D(basis_fine, &P_1d_f)); + CeedCall(CeedBasisDestroy(&basis_fine)); CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c)); P_1d_c = dim == 1 ? num_nodes_c : dim == 2 ? 
sqrt(num_nodes_c) : cbrt(num_nodes_c); CeedCall(CeedCalloc(P_1d_f, &q_ref)); @@ -2563,7 +2811,9 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_ } // Core code - CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); + CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, + op_restrict)); + CeedCall(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -2614,6 +2864,7 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f CeedCall(CeedBasisGetDimension(basis_fine, &dim)); CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp)); CeedCall(CeedBasisGetNumNodes(basis_fine, &num_nodes_f)); + CeedCall(CeedBasisDestroy(&basis_fine)); CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c)); CeedCall(CeedCalloc(num_nodes_f * dim, &q_ref)); CeedCall(CeedCalloc(num_nodes_f, &q_weight)); @@ -2625,7 +2876,9 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f } // Core code - CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); + CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, + op_restrict)); + CeedCall(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -2652,7 +2905,6 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, Ceed ceed, ceed_parent; bool interp = false, grad = false, is_tensor_basis = true; CeedInt num_input_fields, P_1d, Q_1d, num_nodes, num_qpts, dim, num_comp = 1, num_elem = 1; - CeedSize l_size = 1; CeedScalar *mass, *laplace, *x, *fdm_interp, *lambda, *elem_avg; const CeedScalar *interp_1d, *grad_1d, *q_weight_1d; CeedVector q_data; @@ -2673,6 +2925,7 @@ int 
CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, // Operator fallback CeedOperator op_fallback; + CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorCreateFDMElementInverse\n"); CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { CeedCall(CeedOperatorCreateFDMElementInverse(op_fallback, fdm_inv, request)); @@ -2698,9 +2951,10 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedCall(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); interp = interp || eval_mode == CEED_EVAL_INTERP; grad = grad || eval_mode == CEED_EVAL_GRAD; - CeedCall(CeedOperatorFieldGetBasis(op_fields[i], &basis)); - CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + if (!basis) CeedCall(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + if (!rstr) CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); } + CeedCall(CeedVectorDestroy(&vec)); } CeedCheck(basis, ceed, CEED_ERROR_BACKEND, "No active field set"); CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); @@ -2710,7 +2964,6 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedCall(CeedBasisGetDimension(basis, &dim)); CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); // Build and diagonalize 1D Mass and Laplacian CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); @@ -2802,8 +3055,9 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedCall(CeedVectorGetArrayWrite(q_data, CEED_MEM_HOST, &q_data_array)); for (CeedInt e = 0; e < num_elem; e++) { for (CeedInt c = 0; c < num_comp; c++) { - for (CeedInt n = 0; n < num_nodes; n++) + for (CeedInt n = 0; n < num_nodes; n++) { q_data_array[(e * num_comp + c) * num_nodes + n] = 1. 
/ (elem_avg[e] * fdm_diagonal[c * num_nodes + n]); + } } } CeedCall(CeedFree(&elem_avg)); @@ -2830,7 +3084,8 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, // -- Restriction { CeedInt strides[3] = {1, num_nodes, num_nodes * num_comp}; - CeedCall(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, num_nodes, num_comp, num_elem * num_comp * num_nodes, strides, &rstr_qd_i)); + CeedCall(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, num_nodes, num_comp, + (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes, strides, &rstr_qd_i)); } // -- QFunction @@ -2859,9 +3114,14 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedCall(CeedOperatorSetField(*fdm_inv, "output", rstr, fdm_basis, CEED_VECTOR_ACTIVE)); // Cleanup + CeedCall(CeedDestroy(&ceed)); + CeedCall(CeedDestroy(&ceed_parent)); CeedCall(CeedVectorDestroy(&q_data)); - CeedCall(CeedBasisDestroy(&fdm_basis)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); CeedCall(CeedElemRestrictionDestroy(&rstr_qd_i)); + CeedCall(CeedBasisDestroy(&basis)); + CeedCall(CeedBasisDestroy(&fdm_basis)); + CeedCall(CeedQFunctionDestroy(&qf)); CeedCall(CeedQFunctionDestroy(&qf_fdm)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-qfunction-register.c b/interface/ceed-qfunction-register.c index 3558d0a225..eb3832c4f5 100644 --- a/interface/ceed-qfunction-register.c +++ b/interface/ceed-qfunction-register.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -32,7 +32,7 @@ int CeedQFunctionRegisterAll(void) { CeedPragmaCritical(CeedQFunctionRegisterAll) { if (!register_all_called) { - CeedDebugEnv256(1, "\n---------- Registering Gallery QFunctions ----------\n"); + CeedDebugEnv256(CEED_DEBUG_COLOR_SUCCESS, "\n---------- Registering Gallery QFunctions ----------\n"); #define CEED_GALLERY_QFUNCTION(name) \ if (!ierr) ierr = name(); #include "../gallery/ceed-gallery-list.h" diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c index 226b33c19e..8f2ffbbd70 100644 --- a/interface/ceed-qfunction.c +++ b/interface/ceed-qfunction.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -117,13 +117,14 @@ static int CeedQFunctionFieldSet(CeedQFunctionField *f, const char *field_name, @param[in] field `CeedQFunction` field to view @param[in] field_number Number of field being viewed @param[in] in true for input field, false for output + @param[in] tabs Tabs to append before each new line @param[in] stream Stream to view to, e.g., `stdout` @return An error code: 0 - success, otherwise - failure @ref Utility **/ -static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, FILE *stream) { +static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, const char *tabs, FILE *stream) { const char *inout = in ? 
"Input" : "Output"; const char *field_name; CeedInt size; @@ -131,13 +132,42 @@ static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number CeedCall(CeedQFunctionFieldGetData(field, &field_name, &size, &eval_mode)); fprintf(stream, - " %s field %" CeedInt_FMT - ":\n" - " Name: \"%s\"\n" + "%s %s field %" CeedInt_FMT + ":\n%s" + " Name: \"%s\"\n%s" " Size: %" CeedInt_FMT - "\n" + "\n%s" " EvalMode: \"%s\"\n", - inout, field_number, field_name, size, CeedEvalModes[eval_mode]); + tabs, inout, field_number, tabs, field_name, tabs, size, tabs, CeedEvalModes[eval_mode]); + return CEED_ERROR_SUCCESS; +} + +/** + @brief View a `CeedQFunction` passed as a `CeedObject` + + @param[in] qf `CeedQFunction` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedQFunctionView_Object(CeedObject qf, FILE *stream) { + CeedCall(CeedQFunctionView((CeedQFunction)qf, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `CeedQFunction` passed as a `CeedObject` + + @param[in,out] qf Address of `CeedQFunction` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedQFunctionDestroy_Object(CeedObject *qf) { + CeedCall(CeedQFunctionDestroy((CeedQFunction *)qf)); return CEED_ERROR_SUCCESS; } @@ -196,11 +226,31 @@ int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, CeedInt *num_o return CEED_ERROR_SUCCESS; } +/** + @brief Get the name of the `CeedQFunction`. + Use the `name` if created via @ref CeedQFunctionCreateInteriorByName(), otherwise return the kernel name via @ref CeedQFunctionGetKernelName(). 
+ + @param[in] qf `CeedQFunction` + @param[out] name Variable to store `CeedQFunction` name + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedQFunctionGetName(CeedQFunction qf, const char **name) { + if (qf->is_gallery) { + *name = qf->gallery_name; + } else { + CeedCall(CeedQFunctionGetKernelName(qf, name)); + } + return CEED_ERROR_SUCCESS; +} + /** @brief Get the name of the user function for a `CeedQFunction` @param[in] qf `CeedQFunction` - @param[out] kernel_name Variable to store source path string + @param[out] kernel_name Variable to store string holding kernel name @return An error code: 0 - success, otherwise - failure @@ -252,6 +302,7 @@ int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path) { } else { CeedCall(CeedGetJitAbsolutePath(ceed, qf->user_source, &absolute_path)); } + CeedCall(CeedDestroy(&ceed)); size_t source_len = strlen(absolute_path) - kernel_name_len - 1; @@ -295,6 +346,7 @@ int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer CeedCall(CeedQFunctionGetCeed(qf, &ceed)); CeedCall(CeedLoadSourceToBuffer(ceed, source_path, &buffer)); + CeedCall(CeedDestroy(&ceed)); *source_buffer = buffer; } return CEED_ERROR_SUCCESS; @@ -328,7 +380,8 @@ int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f) { @ref Backend **/ int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx) { - *ctx = qf->ctx; + *ctx = NULL; + if (qf->ctx) CeedCall(CeedQFunctionContextReferenceCopy(qf->ctx, ctx)); return CEED_ERROR_SUCCESS; } @@ -359,6 +412,7 @@ int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *da } else { *(void **)data = NULL; } + CeedCall(CeedQFunctionContextDestroy(&ctx)); return CEED_ERROR_SUCCESS; } @@ -385,7 +439,7 @@ int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data) { CeedCall(CeedQFunctionContextRestoreDataRead(ctx, data)); } } - *(void **)data = NULL; + 
CeedCall(CeedQFunctionContextDestroy(&ctx)); return CEED_ERROR_SUCCESS; } @@ -413,6 +467,7 @@ int CeedQFunctionGetInnerContext(CeedQFunction qf, CeedQFunctionContext *ctx) { } else { *ctx = qf_ctx; } + CeedCall(CeedQFunctionContextDestroy(&qf_ctx)); return CEED_ERROR_SUCCESS; } @@ -572,7 +627,7 @@ int CeedQFunctionSetImmutable(CeedQFunction qf) { @ref Backend **/ int CeedQFunctionReference(CeedQFunction qf) { - qf->ref_count++; + CeedCall(CeedObjectReference((CeedObject)qf)); return CEED_ERROR_SUCCESS; } @@ -608,6 +663,8 @@ int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops) { @param[in] source Absolute path to source of `CeedQFunctionUser`, "\abs_path\file.h:function_name". The entire source file must only contain constructs supported by all targeted backends (i.e. CUDA for `/gpu/cuda`, OpenCL/SYCL for `/gpu/sycl`, etc.). The entire contents of this file and all locally included files are used during JiT compilation for GPU backends. + The header `ceed/types.h` is preferred over `ceed.h` or `ceed/ceed.h` for `CeedQFunction` source files. + The macro `CEED_RUNNING_JIT_PASS` is set during JiT and can be used to guard include statements that JiT compilers cannot use, such as `math.h` or `std*.h`. All source files must be at the provided filepath at runtime for JiT to function. 
@param[out] qf Address of the variable where the newly created `CeedQFunction` will be stored @@ -624,8 +681,9 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "QFunction")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionCreateInterior"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionCreateInterior"); CeedCall(CeedQFunctionCreateInterior(delegate, vec_length, f, source, qf)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } @@ -633,8 +691,7 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser "Provided path to source does not include function name. Provided: \"%s\"\nRequired: \"\\abs_path\\file.h:function_name\"", source); CeedCall(CeedCalloc(1, qf)); - CeedCall(CeedReferenceCopy(ceed, &(*qf)->ceed)); - (*qf)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedQFunctionView_Object, CeedQFunctionDestroy_Object, &(*qf)->obj)); (*qf)->vec_length = vec_length; (*qf)->is_identity = false; (*qf)->is_context_writable = true; @@ -725,6 +782,7 @@ int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, C CeedCall(CeedQFunctionGetContext(*qf, &ctx)); CeedCall(CeedQFunctionContextGetFieldLabel(ctx, "size", &size_label)); CeedCall(CeedQFunctionContextSetInt32(ctx, size_label, &size)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); return CEED_ERROR_SUCCESS; } @@ -755,30 +813,37 @@ int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy) { @param[in,out] qf `CeedQFunction` @param[in] field_name Name of `CeedQFunction` field - @param[in] size Size of `CeedQFunction` field, (`num_comp * 1`) for @ref CEED_EVAL_NONE, (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, (`num_comp * dim`) for @ref CEED_EVAL_GRAD, or 
(`num_comp * 1`) for @ref CEED_EVAL_DIV, and (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL. + @param[in] size Size of `CeedQFunction` field, + (`num_comp * 1`) for @ref CEED_EVAL_NONE, + (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, + (`num_comp * dim`) for @ref CEED_EVAL_GRAD, + (`num_comp * 1`) for @ref CEED_EVAL_DIV, and + (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL. @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, @ref CEED_EVAL_INTERP to use interpolated values, @ref CEED_EVAL_GRAD to use gradients, @ref CEED_EVAL_DIV to use divergence, @ref CEED_EVAL_CURL to use curl + Note: In the user `CeedQFunctionUser`, the `in` argument list the fields in the order given by the calls to `CeedQFunctionAddInput`. + @return An error code: 0 - success, otherwise - failure @ref User **/ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) { bool is_immutable; - Ceed ceed; - CeedCall(CeedQFunctionGetCeed(qf, &ceed)); CeedCall(CeedQFunctionIsImmutable(qf, &is_immutable)); - CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable"); - CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, ceed, CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1"); + CeedCheck(!is_immutable, CeedQFunctionReturnCeed(qf), CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable"); + CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1"); for (CeedInt i = 0; i < qf->num_input_fields; i++) { - CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique"); + CeedCheck(strcmp(field_name, 
qf->input_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR, + "CeedQFunction field names must be unique. Duplicate name: %s", field_name); } for (CeedInt i = 0; i < qf->num_output_fields; i++) { - CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique"); + CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR, + "CeedQFunction field names must be unique. Duplicate name: %s", field_name); } CeedCall(CeedQFunctionFieldSet(&qf->input_fields[qf->num_input_fields], field_name, size, eval_mode)); qf->num_input_fields++; @@ -790,30 +855,38 @@ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size @param[in,out] qf `CeedQFunction` @param[in] field_name Name of `CeedQFunction` field - @param[in] size Size of `CeedQFunction` field, (`num_comp * 1`) for @ref CEED_EVAL_NONE, (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, (`num_comp * dim`) for @ref CEED_EVAL_GRAD, or (`num_comp * 1`) for @ref CEED_EVAL_DIV, and (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` else dim for @ref CEED_EVAL_CURL. + @param[in] size Size of `CeedQFunction` field, + (`num_comp * 1`) for @ref CEED_EVAL_NONE, + (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, + (`num_comp * dim`) for @ref CEED_EVAL_GRAD, + (`num_comp * 1`) for @ref CEED_EVAL_DIV, and + (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL. @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, @ref CEED_EVAL_INTERP to use interpolated values, @ref CEED_EVAL_GRAD to use gradients, @ref CEED_EVAL_DIV to use divergence, @ref CEED_EVAL_CURL to use curl. 
+ Note: In the user `CeedQFunctionUser`, the `out` argument list the fields in the order given by the calls to `CeedQFunctionAddOutput`. + @return An error code: 0 - success, otherwise - failure @ref User **/ int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) { bool is_immutable; - Ceed ceed; - CeedCall(CeedQFunctionGetCeed(qf, &ceed)); CeedCall(CeedQFunctionIsImmutable(qf, &is_immutable)); - CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "CeedQFunction cannot be changed after set as immutable"); - CeedCheck(eval_mode != CEED_EVAL_WEIGHT, ceed, CEED_ERROR_DIMENSION, "Cannot create CeedQFunction output with CEED_EVAL_WEIGHT"); + CeedCheck(!is_immutable, CeedQFunctionReturnCeed(qf), CEED_ERROR_MAJOR, "CeedQFunction cannot be changed after set as immutable"); + CeedCheck(eval_mode != CEED_EVAL_WEIGHT, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION, + "Cannot create CeedQFunction output with CEED_EVAL_WEIGHT"); for (CeedInt i = 0; i < qf->num_input_fields; i++) { - CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique"); + CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR, + "CeedQFunction field names must be unique"); } for (CeedInt i = 0; i < qf->num_output_fields; i++) { - CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique"); + CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR, + "CeedQFunction field names must be unique"); } CeedCall(CeedQFunctionFieldSet(&qf->output_fields[qf->num_output_fields], field_name, size, eval_mode)); qf->num_output_fields++; @@ -966,6 +1039,36 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) { return CEED_ERROR_SUCCESS; } +/** + @brief Set the number of tabs to indent for @ref 
CeedQFunctionView() output + + @param[in] qf `CeedQFunction` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs) { + CeedCall(CeedObjectSetNumViewTabs((CeedObject)qf, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedQFunctionView() output + + @param[in] qf `CeedQFunction` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs) { + CeedCall(CeedObjectGetNumViewTabs((CeedObject)qf, num_tabs)); + return CEED_ERROR_SUCCESS; +} + /** @brief View a `CeedQFunction` @@ -977,20 +1080,30 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) { @ref User **/ int CeedQFunctionView(CeedQFunction qf, FILE *stream) { - const char *kernel_name; + char *tabs = NULL; + const char *name; + + { + CeedInt num_tabs = 0; + + CeedCall(CeedQFunctionGetNumViewTabs(qf, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; + } - CeedCall(CeedQFunctionGetKernelName(qf, &kernel_name)); - fprintf(stream, "%sCeedQFunction - %s\n", qf->is_gallery ? "Gallery " : "User ", qf->is_gallery ? qf->gallery_name : kernel_name); + CeedCall(CeedQFunctionGetName(qf, &name)); + fprintf(stream, "%s%sCeedQFunction - %s\n", tabs, qf->is_gallery ? "Gallery " : "User ", name); - fprintf(stream, " %" CeedInt_FMT " input field%s:\n", qf->num_input_fields, qf->num_input_fields > 1 ? "s" : ""); + fprintf(stream, "%s %" CeedInt_FMT " input field%s:\n", tabs, qf->num_input_fields, qf->num_input_fields > 1 ? 
"s" : ""); for (CeedInt i = 0; i < qf->num_input_fields; i++) { - CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, stream)); + CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, tabs, stream)); } - fprintf(stream, " %" CeedInt_FMT " output field%s:\n", qf->num_output_fields, qf->num_output_fields > 1 ? "s" : ""); + fprintf(stream, "%s %" CeedInt_FMT " output field%s:\n", tabs, qf->num_output_fields, qf->num_output_fields > 1 ? "s" : ""); for (CeedInt i = 0; i < qf->num_output_fields; i++) { - CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, stream)); + CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, tabs, stream)); } + CeedCall(CeedFree(&tabs)); return CEED_ERROR_SUCCESS; } @@ -1005,7 +1118,7 @@ int CeedQFunctionView(CeedQFunction qf, FILE *stream) { @ref Advanced **/ int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) { - *ceed = CeedQFunctionReturnCeed(qf); + CeedCall(CeedObjectGetCeed((CeedObject)qf, ceed)); return CEED_ERROR_SUCCESS; } @@ -1018,7 +1131,7 @@ int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) { @ref Advanced **/ -Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return qf->ceed; } +Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return CeedObjectReturnCeed((CeedObject)qf); } /** @brief Apply the action of a `CeedQFunction` @@ -1036,13 +1149,11 @@ Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return qf->ceed; } **/ int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v) { CeedInt vec_length; - Ceed ceed; - CeedCall(CeedQFunctionGetCeed(qf, &ceed)); - CeedCheck(qf->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionApply"); + CeedCheck(qf->Apply, CeedQFunctionReturnCeed(qf), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionApply"); CeedCall(CeedQFunctionGetVectorLength(qf, &vec_length)); - CeedCheck(Q % vec_length == 0, ceed, CEED_ERROR_DIMENSION, "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q, - 
qf->vec_length); + CeedCheck(Q % vec_length == 0, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION, + "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q, qf->vec_length); CeedCall(CeedQFunctionSetImmutable(qf)); CeedCall(qf->Apply(qf, Q, u, v)); return CEED_ERROR_SUCCESS; @@ -1058,7 +1169,7 @@ int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v @ref User **/ int CeedQFunctionDestroy(CeedQFunction *qf) { - if (!*qf || --(*qf)->ref_count > 0) { + if (!*qf || CeedObjectDereference((CeedObject)*qf) > 0) { *qf = NULL; return CEED_ERROR_SUCCESS; } @@ -1085,7 +1196,7 @@ int CeedQFunctionDestroy(CeedQFunction *qf) { CeedCall(CeedFree(&(*qf)->source_path)); CeedCall(CeedFree(&(*qf)->gallery_name)); CeedCall(CeedFree(&(*qf)->kernel_name)); - CeedCall(CeedDestroy(&(*qf)->ceed)); + CeedCall(CeedObjectDestroy_Private(&(*qf)->obj)); CeedCall(CeedFree(qf)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c index ddb9549fa4..48563a9999 100644 --- a/interface/ceed-qfunctioncontext.c +++ b/interface/ceed-qfunctioncontext.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -59,12 +59,11 @@ int CeedQFunctionContextRegisterGeneric(CeedQFunctionContext ctx, const char *fi CeedContextFieldType field_type, size_t num_values) { size_t field_size = 0; CeedInt field_index = -1; - Ceed ceed; // Check for duplicate - CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCall(CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index)); - CeedCheck(field_index == -1, ceed, CEED_ERROR_UNSUPPORTED, "QFunctionContext field with name \"%s\" already registered", field_name); + CeedCheck(field_index == -1, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, + "QFunctionContext field with name \"%s\" already registered", field_name); // Allocate space for field data if (ctx->num_fields == 0) { @@ -128,6 +127,35 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) { return CEED_ERROR_SUCCESS; } +/** + @brief View a `CeedQFunctionContext` passed as a `CeedObject` + + @param[in] ctx `CeedQFunctionContext` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedQFunctionContextView_Object(CeedObject ctx, FILE *stream) { + CeedCall(CeedQFunctionContextView((CeedQFunctionContext)ctx, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `CeedQFunctionContext` passed as a `CeedObject` + + @param[in,out] ctx Address of `CeedQFunctionContext` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedQFunctionContextDestroy_Object(CeedObject *ctx) { + CeedCall(CeedQFunctionContextDestroy((CeedQFunctionContext *)ctx)); + return CEED_ERROR_SUCCESS; +} + /// @} /// ---------------------------------------------------------------------------- @@ -147,7 +175,7 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) { @ref Backend **/ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) { - *ceed 
= CeedQFunctionContextReturnCeed(ctx); + CeedCall(CeedObjectGetCeed((CeedObject)ctx, ceed)); return CEED_ERROR_SUCCESS; } @@ -160,7 +188,7 @@ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) { @ref Backend **/ -Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return ctx->ceed; } +Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return CeedObjectReturnCeed((CeedObject)ctx); } /** @brief Check for valid data in a `CeedQFunctionContext` @@ -542,7 +570,7 @@ int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_ @ref Backend **/ int CeedQFunctionContextReference(CeedQFunctionContext ctx) { - ctx->ref_count++; + CeedCall(CeedObjectReference((CeedObject)ctx)); return CEED_ERROR_SUCCESS; } @@ -569,14 +597,14 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) { Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Context")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextCreate"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionContextCreate"); CeedCall(CeedQFunctionContextCreate(delegate, ctx)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } CeedCall(CeedCalloc(1, ctx)); - CeedCall(CeedReferenceCopy(ceed, &(*ctx)->ceed)); - (*ctx)->ref_count = 1; + CeedCall(CeedObjectCreate(ceed, CeedQFunctionContextView_Object, CeedQFunctionContextDestroy_Object, &(*ctx)->obj)); CeedCall(ceed->QFunctionContextCreate(*ctx)); return CEED_ERROR_SUCCESS; } @@ -620,11 +648,9 @@ int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, CeedQFunctionCon @ref User **/ int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, size_t size, void *data) { - Ceed ceed; - - CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed)); - CeedCheck(ctx->SetData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support 
CeedQFunctionContextSetData"); - CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + CeedCheck(ctx->SetData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextSetData"); + CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1, + "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); CeedCall(CeedQFunctionContextDestroyData(ctx)); ctx->ctx_size = size; @@ -650,17 +676,16 @@ int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { void *temp_data = NULL; bool has_valid_data = true, has_borrowed_data_of_type = true; - Ceed ceed; - CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); - CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data"); + CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data"); - CeedCheck(ctx->TakeData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextTakeData"); - CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + CeedCheck(ctx->TakeData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextTakeData"); + CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1, + "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); CeedCall(CeedQFunctionContextHasBorrowedDataOfType(ctx, mem_type, &has_borrowed_data_of_type)); - CeedCheck(has_borrowed_data_of_type, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_borrowed_data_of_type, 
CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no borrowed %s data, must set data with CeedQFunctionContextSetData", CeedMemTypes[mem_type]); CeedCall(ctx->TakeData(ctx, mem_type, &temp_data)); @@ -687,15 +712,15 @@ int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, **/ int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { bool has_valid_data = true; - Ceed ceed; - CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed)); - CeedCheck(ctx->GetData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetData"); - CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); - CeedCheck(ctx->num_readers == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, a process has read access"); + CeedCheck(ctx->GetData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetData"); + CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1, + "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + CeedCheck(ctx->num_readers == 0, CeedQFunctionContextReturnCeed(ctx), 1, + "Cannot grant CeedQFunctionContext data access, a process has read access"); CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); - CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data"); + CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data"); CeedCall(ctx->GetData(ctx, mem_type, data)); ctx->state++; @@ -721,14 +746,14 @@ int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, **/ int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { bool has_valid_data = true; - Ceed ceed; - 
CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed)); - CeedCheck(ctx->GetDataRead, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetDataRead"); - CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + CeedCheck(ctx->GetDataRead, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, + "Backend does not support CeedQFunctionContextGetDataRead"); + CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1, + "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); - CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data"); + CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data"); CeedCall(ctx->GetDataRead(ctx, mem_type, data)); ctx->num_readers++; @@ -883,6 +908,36 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz return CEED_ERROR_SUCCESS; } +/** + @brief Set the number of tabs to indent for @ref CeedQFunctionContextView() output + + @param[in] ctx `CeedQFunctionContext` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs) { + CeedCall(CeedObjectSetNumViewTabs((CeedObject)ctx, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedQFunctionContextView() output + + @param[in] ctx `CeedQFunctionContext` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs) { + 
CeedCall(CeedObjectGetNumViewTabs((CeedObject)ctx, num_tabs)); + return CEED_ERROR_SUCCESS; +} + /** @brief View a `CeedQFunctionContext` @@ -894,11 +949,22 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz @ref User **/ int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream) { - fprintf(stream, "CeedQFunctionContext\n"); - fprintf(stream, " Context Data Size: %zu\n", ctx->ctx_size); + char *tabs = NULL; + + { + CeedInt num_tabs = 0; + + CeedCall(CeedQFunctionContextGetNumViewTabs(ctx, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; + } + + fprintf(stream, "%sCeedQFunctionContext\n", tabs); + fprintf(stream, "%s Context Data Size: %zu\n", tabs, ctx->ctx_size); for (CeedInt i = 0; i < ctx->num_fields; i++) { - fprintf(stream, " Labeled %s field: %s\n", CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name); + fprintf(stream, "%s Labeled %s field: %s\n", tabs, CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name); } + CeedCall(CeedFree(&tabs)); return CEED_ERROR_SUCCESS; } @@ -930,11 +996,11 @@ int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_m @ref User **/ int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) { - if (!*ctx || --(*ctx)->ref_count > 0) { + if (!*ctx || CeedObjectDereference((CeedObject)*ctx) > 0) { *ctx = NULL; return CEED_ERROR_SUCCESS; } - CeedCheck(((*ctx)->state % 2) == 0, (*ctx)->ceed, 1, "Cannot destroy CeedQFunctionContext, the access lock is in use"); + CeedCheck(((*ctx)->state % 2) == 0, CeedQFunctionContextReturnCeed(*ctx), 1, "Cannot destroy CeedQFunctionContext, the access lock is in use"); CeedCall(CeedQFunctionContextDestroyData(*ctx)); if ((*ctx)->Destroy) CeedCall((*ctx)->Destroy(*ctx)); @@ -944,7 +1010,7 @@ int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) { 
CeedCall(CeedFree(&(*ctx)->field_labels[i])); } CeedCall(CeedFree(&(*ctx)->field_labels)); - CeedCall(CeedDestroy(&(*ctx)->ceed)); + CeedCall(CeedObjectDestroy_Private(&(*ctx)->obj)); CeedCall(CeedFree(ctx)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-register.c b/interface/ceed-register.c index bdc8a95d10..759a6463fb 100644 --- a/interface/ceed-register.c +++ b/interface/ceed-register.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -32,7 +32,7 @@ int CeedRegisterAll(void) { CeedPragmaCritical(CeedRegisterAll) { if (!register_all_called) { - CeedDebugEnv256(1, "\n---------- Registering Backends ----------\n"); + CeedDebugEnv256(CEED_DEBUG_COLOR_SUCCESS, "\n---------- Registering Backends ----------\n"); #define CEED_BACKEND(name, ...) \ if (!ierr) ierr = name(); #include "../backends/ceed-backend-list.h" diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c index 7cbc69e00c..24f3687c62 100644 --- a/interface/ceed-tensor.c +++ b/interface/ceed-tensor.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -13,6 +13,28 @@ /// @file /// Implementation of CeedTensorContract interfaces +/// ---------------------------------------------------------------------------- +/// CeedTensorContract Library Internal Functions +/// ---------------------------------------------------------------------------- +/// @addtogroup CeedTensorContractDeveloper +/// @{ + +/** + @brief Destroy a `CeedTensorContract` passed as a `CeedObject` + + @param[in,out] contract Address of `CeedTensorContract` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedTensorContractDestroy_Object(CeedObject *contract) { + CeedCall(CeedTensorContractDestroy((CeedTensorContract *)contract)); + return CEED_ERROR_SUCCESS; +} + +/// @} + /// ---------------------------------------------------------------------------- /// CeedTensorContract Backend API /// ---------------------------------------------------------------------------- @@ -34,13 +56,14 @@ int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract) { Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "TensorContract")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedTensorContractCreate"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedTensorContractCreate"); CeedCall(CeedTensorContractCreate(delegate, contract)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } CeedCall(CeedCalloc(1, contract)); - CeedCall(CeedReferenceCopy(ceed, &(*contract)->ceed)); + CeedCall(CeedObjectCreate(ceed, NULL, CeedTensorContractDestroy_Object, &(*contract)->obj)); CeedCall(ceed->TensorContractCreate(*contract)); return CEED_ERROR_SUCCESS; } @@ -123,7 +146,7 @@ int CeedTensorContractStridedApply(CeedTensorContract contract, CeedInt A, CeedI @ref Backend **/ int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) { - *ceed = 
CeedTensorContractReturnCeed(contract); + CeedCall(CeedObjectGetCeed((CeedObject)contract, ceed)); return CEED_ERROR_SUCCESS; } @@ -136,7 +159,7 @@ int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) { @ref Backend **/ -Ceed CeedTensorContractReturnCeed(CeedTensorContract contract) { return contract->ceed; } +Ceed CeedTensorContractReturnCeed(CeedTensorContract contract) { return CeedObjectReturnCeed((CeedObject)contract); } /** @brief Get backend data of a `CeedTensorContract` @@ -178,7 +201,7 @@ int CeedTensorContractSetData(CeedTensorContract contract, void *data) { @ref Backend **/ int CeedTensorContractReference(CeedTensorContract contract) { - contract->ref_count++; + CeedCall(CeedObjectReference((CeedObject)contract)); return CEED_ERROR_SUCCESS; } @@ -214,14 +237,14 @@ int CeedTensorContractReferenceCopy(CeedTensorContract tensor, CeedTensorContrac @ref Backend **/ int CeedTensorContractDestroy(CeedTensorContract *contract) { - if (!*contract || --(*contract)->ref_count > 0) { + if (!*contract || CeedObjectDereference((CeedObject)*contract) > 0) { *contract = NULL; return CEED_ERROR_SUCCESS; } if ((*contract)->Destroy) { CeedCall((*contract)->Destroy(*contract)); } - CeedCall(CeedDestroy(&(*contract)->ceed)); + CeedCall(CeedObjectDestroy_Private(&(*contract)->obj)); CeedCall(CeedFree(contract)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-types.c b/interface/ceed-types.c index 564a5b009a..cbec562cff 100644 --- a/interface/ceed-types.c +++ b/interface/ceed-types.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c index 39b72c770f..eb9f4fc85b 100644 --- a/interface/ceed-vector.c +++ b/interface/ceed-vector.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -33,6 +33,43 @@ const CeedVector CEED_VECTOR_NONE = &ceed_vector_none; /// @} +/// ---------------------------------------------------------------------------- +/// CeedVector Internal Functions +/// ---------------------------------------------------------------------------- +/// @addtogroup CeedVectorDeveloper +/// @{ + +/** + @brief View a `CeedVector` passed as a `CeedObject` + + @param[in] vec `CeedVector` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedVectorView_Object(CeedObject vec, FILE *stream) { + CeedCall(CeedVectorView((CeedVector)vec, "%12.8f", stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `CeedVector` passed as a `CeedObject` + + @param[in,out] vec Address of `CeedVector` to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedVectorDestroy_Object(CeedObject *vec) { + CeedCall(CeedVectorDestroy((CeedVector *)vec)); + return CEED_ERROR_SUCCESS; +} + +/// @} + /// ---------------------------------------------------------------------------- /// CeedVector Backend API /// ---------------------------------------------------------------------------- @@ -135,7 +172,7 @@ int CeedVectorSetData(CeedVector vec, void *data) { @ref Backend **/ int CeedVectorReference(CeedVector vec) { - vec->ref_count++; + 
CeedCall(CeedObjectReference((CeedObject)vec)); return CEED_ERROR_SUCCESS; } @@ -159,20 +196,21 @@ int CeedVectorReference(CeedVector vec) { @ref User **/ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) { + CeedCheck(length >= 0, ceed, CEED_ERROR_UNSUPPORTED, "CeedVector must have length >= 0, received %" CeedSize_FMT, length); if (!ceed->VectorCreate) { Ceed delegate; CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); - CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorCreate"); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); CeedCall(CeedVectorCreate(delegate, length, vec)); + CeedCall(CeedDestroy(&delegate)); return CEED_ERROR_SUCCESS; } CeedCall(CeedCalloc(1, vec)); - CeedCall(CeedReferenceCopy(ceed, &(*vec)->ceed)); - (*vec)->ref_count = 1; - (*vec)->length = length; - (*vec)->state = 0; + CeedCall(CeedObjectCreate(ceed, CeedVectorView_Object, CeedVectorDestroy_Object, &(*vec)->obj)); + (*vec)->length = length; + (*vec)->state = 0; CeedCall(ceed->VectorCreate(length, *vec)); return CEED_ERROR_SUCCESS; } @@ -202,34 +240,42 @@ int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy) { /** @brief Copy a `CeedVector` into a different `CeedVector`. - Both pointers should be destroyed with @ref CeedVectorDestroy(). - - Note: If `*vec_copy` is non-`NULL`, then it is assumed that `*vec_copy` is a pointer to a `CeedVector`. - This `CeedVector` will be destroyed if `*vec_copy` is the only reference to this `CeedVector`. 
- @param[in] vec `CeedVector` to copy - @param[in,out] vec_copy Variable to store copied `CeedVector` to + @param[in,out] vec_copy `CeedVector` to copy array into @return An error code: 0 - success, otherwise - failure @ref User **/ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) { - Ceed ceed; CeedMemType mem_type, mem_type_copy; CeedScalar *array; - // Get the preferred memory type - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCall(CeedGetPreferredMemType(ceed, &mem_type)); + // Get the preferred memory types + { + Ceed ceed; - // Get the preferred memory type - CeedCall(CeedVectorGetCeed(vec_copy, &ceed)); - CeedCall(CeedGetPreferredMemType(ceed, &mem_type_copy)); + CeedCall(CeedVectorGetCeed(vec, &ceed)); + CeedCall(CeedGetPreferredMemType(ceed, &mem_type)); + CeedCall(CeedDestroy(&ceed)); + + CeedCall(CeedVectorGetCeed(vec_copy, &ceed)); + CeedCall(CeedGetPreferredMemType(ceed, &mem_type_copy)); + CeedCall(CeedDestroy(&ceed)); + } // Check that both have same memory type if (mem_type != mem_type_copy) mem_type = CEED_MEM_HOST; + // Check compatible lengths + { + CeedSize length_vec, length_copy; + + CeedCall(CeedVectorGetLength(vec, &length_vec)); + CeedCall(CeedVectorGetLength(vec_copy, &length_copy)); + CeedCheck(length_vec == length_copy, CeedVectorReturnCeed(vec), CEED_ERROR_INCOMPATIBLE, "CeedVectors must have the same length to copy"); + } + // Copy the values from vec to vec_copy CeedCall(CeedVectorGetArray(vec, mem_type, &array)); CeedCall(CeedVectorSetArray(vec_copy, mem_type, CEED_COPY_VALUES, array)); @@ -238,6 +284,57 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) { return CEED_ERROR_SUCCESS; } +/** + @brief Copy a strided portion of `CeedVector` contents into a different `CeedVector` + + @param[in] vec `CeedVector` to copy + @param[in] start First index to copy in the range `[start, stop)` + @param[in] stop One past the last element to copy in the range, or `-1` for `length` + @param[in] step Stride between indices to copy + 
@param[in,out] vec_copy `CeedVector` to copy values to + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) { + CeedSize length; + const CeedScalar *array = NULL; + CeedScalar *array_copy = NULL; + + // Check length + { + CeedSize length_vec, length_copy; + + CeedCall(CeedVectorGetLength(vec, &length_vec)); + CeedCall(CeedVectorGetLength(vec_copy, &length_copy)); + if (length_vec <= 0 || length_copy <= 0) return CEED_ERROR_SUCCESS; + length = length_vec < length_copy ? length_vec : length_copy; + } + CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Invalid value for stop %" CeedSize_FMT ", must be in the range [-1, length]", stop); + CeedCheck(start >= 0 && start <= length && (start <= stop || stop == -1), CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Invalid value for start %" CeedSize_FMT ", must be in the range [0, stop]", start); + + // Backend version + if (vec->CopyStrided && vec_copy->CopyStrided) { + CeedCall(vec->CopyStrided(vec, start, stop, step, vec_copy)); + vec_copy->state += 2; + return CEED_ERROR_SUCCESS; + } + + // Copy + CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &array)); + CeedCall(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &array_copy)); + if (stop == -1) stop = length; + for (CeedSize i = start; i < stop; i += step) array_copy[i] = array[i]; + + // Cleanup + CeedCall(CeedVectorRestoreArrayRead(vec, &array)); + CeedCall(CeedVectorRestoreArray(vec_copy, &array_copy)); + return CEED_ERROR_SUCCESS; +} + /** @brief Set the array used by a `CeedVector`, freeing any previously allocated array if applicable. 
@@ -255,13 +352,11 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) { **/ int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) { CeedSize length; - Ceed ceed; - - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCheck(vec->SetArray, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray"); - CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); - CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + CeedCheck(vec->SetArray, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray"); + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Cannot grant CeedVector array access, the access lock is already in use"); + CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); CeedCall(CeedVectorGetLength(vec, &length)); if (length > 0) CeedCall(vec->SetArray(vec, mem_type, copy_mode, array)); @@ -280,14 +375,13 @@ int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_m @ref User **/ int CeedVectorSetValue(CeedVector vec, CeedScalar value) { - Ceed ceed; - - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); - CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Cannot grant CeedVector array access, the access lock is already in use"); + CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); if (vec->SetValue) { 
CeedCall(vec->SetValue(vec, value)); + vec->state += 2; } else { CeedSize length; CeedScalar *array; @@ -297,7 +391,46 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) { for (CeedSize i = 0; i < length; i++) array[i] = value; CeedCall(CeedVectorRestoreArray(vec, &array)); } - vec->state += 2; + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set a portion of a `CeedVector` to a constant value. + + Note: The `CeedVector` must already have valid data set via @ref CeedVectorSetArray() or similar. + + @param[in,out] vec `CeedVector` + @param[in] start First index to set in range `[start, stop)` + @param[in] stop One past the last element to set in the range, or `-1` for `length` + @param[in] step Stride between indices to set + @param[in] value Value to be used + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value) { + CeedSize length; + + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Cannot grant CeedVector array access, the access lock is already in use"); + CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + CeedCall(CeedVectorGetLength(vec, &length)); + CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Invalid value for stop %" CeedSize_FMT ", must be in the range [-1, length]", stop); + + if (vec->SetValueStrided) { + CeedCall(vec->SetValueStrided(vec, start, stop, step, value)); + vec->state += 2; + } else { + CeedScalar *array; + + if (length <= 0) return CEED_ERROR_SUCCESS; + if (stop == -1) stop = length; + CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array)); + for (CeedSize i = start; i < stop; i += step) array[i] = value; + CeedCall(CeedVectorRestoreArray(vec, &array)); + } return CEED_ERROR_SUCCESS; } @@ -352,22 +485,20 @@ int 
CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type) { int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedSize length; CeedScalar *temp_array = NULL; - Ceed ceed; - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use"); - CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access"); + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use"); + CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access"); CeedCall(CeedVectorGetLength(vec, &length)); if (length > 0) { bool has_borrowed_array_of_type = true, has_valid_array = true; CeedCall(CeedVectorHasBorrowedArrayOfType(vec, mem_type, &has_borrowed_array_of_type)); - CeedCheck(has_borrowed_array_of_type, ceed, CEED_ERROR_BACKEND, "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray", - CeedMemTypes[mem_type]); + CeedCheck(has_borrowed_array_of_type, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, + "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray", CeedMemTypes[mem_type]); CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); - CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector has no valid data to take, must set data with CeedVectorSetValue or CeedVectorSetArray"); CeedCall(vec->TakeArray(vec, mem_type, &temp_array)); @@ -395,19 +526,18 @@ int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array **/ int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedSize length; - Ceed ceed; - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCheck(vec->GetArray, 
ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray"); - CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); - CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + CeedCheck(vec->GetArray, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray"); + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Cannot grant CeedVector array access, the access lock is already in use"); + CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); CeedCall(CeedVectorGetLength(vec, &length)); if (length > 0) { bool has_valid_array = true; CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); - CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray"); CeedCall(vec->GetArray(vec, mem_type, array)); @@ -434,18 +564,17 @@ int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) **/ int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) { CeedSize length; - Ceed ceed; - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCheck(vec->GetArrayRead, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead"); - CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector read-only array access, the access lock is already in use"); + CeedCheck(vec->GetArrayRead, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead"); + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Cannot grant CeedVector read-only array access, the access lock is already in use"); 
CeedCall(CeedVectorGetLength(vec, &length)); if (length > 0) { bool has_valid_array = true; CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); - CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray"); CeedCall(vec->GetArrayRead(vec, mem_type, array)); @@ -472,12 +601,11 @@ int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScala **/ int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedSize length; - Ceed ceed; - CeedCall(CeedVectorGetCeed(vec, &ceed)); - CeedCheck(vec->GetArrayWrite, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedVectorGetArrayWrite"); - CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); - CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + CeedCheck(vec->GetArrayWrite, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedVectorGetArrayWrite"); + CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, + "Cannot grant CeedVector array access, the access lock is already in use"); + CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); CeedCall(CeedVectorGetLength(vec, &length)); if (length > 0) { @@ -644,24 +772,36 @@ int CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x) { CeedSize length_x, length_y; CeedScalar *y_array = NULL; CeedScalar const *x_array = NULL; - Ceed ceed, ceed_parent_x, ceed_parent_y; - CeedCall(CeedVectorGetCeed(y, &ceed)); CeedCall(CeedVectorGetLength(y, &length_y)); CeedCall(CeedVectorGetLength(x, &length_x)); - CeedCheck(length_x == length_y, ceed, 
CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths"); - CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY"); + CeedCheck(length_x == length_y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, + "Cannot add vector of different lengths." + " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT, + length_x, length_y); + CeedCheck(x != y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY"); CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x)); - CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array_x, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND, "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y)); - CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array_y, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND, "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); - CeedCall(CeedGetParent(x->ceed, &ceed_parent_x)); - CeedCall(CeedGetParent(y->ceed, &ceed_parent_y)); - CeedCheck(ceed_parent_x == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context"); + { + Ceed ceed_x, ceed_y, ceed_parent_x, ceed_parent_y; + + CeedCall(CeedVectorGetCeed(y, &ceed_y)); + CeedCall(CeedVectorGetCeed(x, &ceed_x)); + CeedCall(CeedGetParent(ceed_x, &ceed_parent_x)); + CeedCall(CeedGetParent(ceed_y, &ceed_parent_y)); + CeedCall(CeedDestroy(&ceed_x)); + CeedCall(CeedDestroy(&ceed_y)); + CeedCheck(ceed_parent_x == ceed_parent_y, CeedVectorReturnCeed(y), CEED_ERROR_INCOMPATIBLE, + "Vectors x and y must be created by the same Ceed context"); + CeedCall(CeedDestroy(&ceed_parent_x)); + CeedCall(CeedDestroy(&ceed_parent_y)); + } // Return early for empty vectors if (length_y == 0) return CEED_ERROR_SUCCESS; @@ -703,25 +843,36 @@ int 
CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector CeedSize length_x, length_y; CeedScalar *y_array = NULL; CeedScalar const *x_array = NULL; - Ceed ceed, ceed_parent_x, ceed_parent_y; - - CeedCall(CeedVectorGetCeed(y, &ceed)); CeedCall(CeedVectorGetLength(y, &length_y)); CeedCall(CeedVectorGetLength(x, &length_x)); - CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths"); - CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY"); + CeedCheck(length_x == length_y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, + "Cannot add vector of different lengths." + " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT, + length_x, length_y); + CeedCheck(x != y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY"); CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x)); - CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array_x, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND, "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y)); - CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array_y, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND, "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); - CeedCall(CeedGetParent(x->ceed, &ceed_parent_x)); - CeedCall(CeedGetParent(y->ceed, &ceed_parent_y)); - CeedCheck(ceed_parent_x == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context"); + { + Ceed ceed_x, ceed_y, ceed_parent_x, ceed_parent_y; + + CeedCall(CeedVectorGetCeed(y, &ceed_y)); + CeedCall(CeedVectorGetCeed(x, &ceed_x)); + CeedCall(CeedGetParent(ceed_x, &ceed_parent_x)); + CeedCall(CeedGetParent(ceed_y, &ceed_parent_y)); + CeedCall(CeedDestroy(&ceed_x)); + 
CeedCall(CeedDestroy(&ceed_y)); + CeedCheck(ceed_parent_x == ceed_parent_y, CeedVectorReturnCeed(y), CEED_ERROR_INCOMPATIBLE, + "Vectors x and y must be created by the same Ceed context"); + CeedCall(CeedDestroy(&ceed_parent_x)); + CeedCall(CeedDestroy(&ceed_parent_y)); + } // Return early for empty vectors if (length_y == 0) return CEED_ERROR_SUCCESS; @@ -764,25 +915,39 @@ int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y) { CeedScalar *w_array = NULL; CeedScalar const *x_array = NULL, *y_array = NULL; CeedSize length_w, length_x, length_y; - Ceed ceed, ceed_parent_w, ceed_parent_x, ceed_parent_y; - CeedCall(CeedVectorGetCeed(w, &ceed)); CeedCall(CeedVectorGetLength(w, &length_w)); CeedCall(CeedVectorGetLength(x, &length_x)); CeedCall(CeedVectorGetLength(y, &length_y)); - CeedCheck(length_w == length_x && length_w == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot multiply vectors of different lengths"); - - CeedCall(CeedGetParent(w->ceed, &ceed_parent_w)); - CeedCall(CeedGetParent(x->ceed, &ceed_parent_x)); - CeedCall(CeedGetParent(y->ceed, &ceed_parent_y)); - CeedCheck(ceed_parent_w == ceed_parent_x && ceed_parent_w == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, - "Vectors w, x, and y must be created by the same Ceed context"); + CeedCheck(length_x >= length_w && length_y >= length_w, CeedVectorReturnCeed(w), CEED_ERROR_UNSUPPORTED, + "Cannot pointwise multiply vectors of incompatible lengths." 
+ " w length: %" CeedSize_FMT " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT, + length_w, length_x, length_y); + + { + Ceed ceed_w, ceed_x, ceed_y, ceed_parent_w, ceed_parent_x, ceed_parent_y; + + CeedCall(CeedVectorGetCeed(w, &ceed_w)); + CeedCall(CeedVectorGetCeed(x, &ceed_x)); + CeedCall(CeedVectorGetCeed(y, &ceed_y)); + CeedCall(CeedGetParent(ceed_w, &ceed_parent_w)); + CeedCall(CeedGetParent(ceed_x, &ceed_parent_x)); + CeedCall(CeedGetParent(ceed_y, &ceed_parent_y)); + CeedCall(CeedDestroy(&ceed_w)); + CeedCall(CeedDestroy(&ceed_x)); + CeedCall(CeedDestroy(&ceed_y)); + CeedCheck(ceed_parent_w == ceed_parent_x && ceed_parent_w == ceed_parent_y, CeedVectorReturnCeed(w), CEED_ERROR_INCOMPATIBLE, + "Vectors w, x, and y must be created by the same Ceed context"); + CeedCall(CeedDestroy(&ceed_parent_w)); + CeedCall(CeedDestroy(&ceed_parent_x)); + CeedCall(CeedDestroy(&ceed_parent_y)); + } CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x)); - CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array_x, CeedVectorReturnCeed(w), CEED_ERROR_BACKEND, "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y)); - CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array_y, CeedVectorReturnCeed(w), CEED_ERROR_BACKEND, "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); // Return early for empty vectors @@ -838,15 +1003,13 @@ int CeedVectorReciprocal(CeedVector vec) { bool has_valid_array = true; CeedSize length; CeedScalar *array; - Ceed ceed; - CeedCall(CeedVectorGetCeed(vec, &ceed)); CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); - CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND, + CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector has no valid data to compute reciprocal, must set data with CeedVectorSetValue or 
CeedVectorSetArray"); // Check if vector data set - CeedCheck(vec->state > 0, ceed, CEED_ERROR_INCOMPLETE, "CeedVector must have data set to take reciprocal"); + CeedCheck(vec->state > 0, CeedVectorReturnCeed(vec), CEED_ERROR_INCOMPLETE, "CeedVector must have data set to take reciprocal"); // Return early for empty vector CeedCall(CeedVectorGetLength(vec, &length)); @@ -867,6 +1030,36 @@ int CeedVectorReciprocal(CeedVector vec) { return CEED_ERROR_SUCCESS; } +/** + @brief Set the number of tabs to indent for @ref CeedVectorView() output + + @param[in] vec `CeedVector` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs) { + CeedCall(CeedObjectSetNumViewTabs((CeedObject)vec, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedVectorView() output + + @param[in] vec `CeedVector` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs) { + CeedCall(CeedObjectGetNumViewTabs((CeedObject)vec, num_tabs)); + return CEED_ERROR_SUCCESS; +} + /** @brief View a `CeedVector` @@ -874,8 +1067,8 @@ int CeedVectorReciprocal(CeedVector vec) { Any portion of the provided range that is outside the range of valid indices for the `CeedVector` will be ignored. 
@param[in] vec `CeedVector` to view - @param[in] start Index of first `CeedVector` entry to view - @param[in] stop Index of last `CeedVector` entry to view + @param[in] start Index of first `CeedVector` entry to view in the range `[start, stop)` + @param[in] stop One past the last element to view in the range, or `-1` for `length` @param[in] step Step between `CeedVector` entries to view @param[in] fp_fmt Printing format @param[in] stream Filestream to write to @@ -886,24 +1079,34 @@ int CeedVectorReciprocal(CeedVector vec) { **/ int CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream) { char fmt[1024]; + char *tabs = NULL; CeedSize length; const CeedScalar *x; CeedCheck(step != 0, CeedVectorReturnCeed(vec), CEED_ERROR_MINOR, "View range 'step' must be nonzero"); + { + CeedInt num_tabs = 0; + + CeedCall(CeedVectorGetNumViewTabs(vec, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; + } + CeedCall(CeedVectorGetLength(vec, &length)); - fprintf(stream, "CeedVector length %" CeedSize_FMT "\n", length); + fprintf(stream, "%sCeedVector length %" CeedSize_FMT "\n", tabs, length); if (start != 0 || stop != length || step != 1) { - fprintf(stream, " start: %" CeedSize_FMT "\n stop: %" CeedSize_FMT "\n step: %" CeedInt_FMT "\n", start, stop, step); + fprintf(stream, "%s start: %" CeedSize_FMT "\n%s stop: %" CeedSize_FMT "\n%s step: %" CeedInt_FMT "\n", tabs, start, tabs, stop, tabs, step); } if (start > length) start = length; - if (stop > length) stop = length; + if (stop == -1 || stop > length) stop = length; - snprintf(fmt, sizeof fmt, " %s\n", fp_fmt ? fp_fmt : "%g"); + snprintf(fmt, sizeof fmt, "%s %s\n", tabs, fp_fmt ? fp_fmt : "%g"); CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &x)); for (CeedSize i = start; step > 0 ? 
(i < stop) : (i > stop); i += step) fprintf(stream, fmt, x[i]); CeedCall(CeedVectorRestoreArrayRead(vec, &x)); - if (stop != length) fprintf(stream, " ...\n"); + if (stop != length) fprintf(stream, "%s ...\n", tabs); + CeedCall(CeedFree(&tabs)); return CEED_ERROR_SUCCESS; } @@ -937,7 +1140,7 @@ int CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream) { @ref Advanced **/ int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) { - *ceed = CeedVectorReturnCeed(vec); + CeedCall(CeedObjectGetCeed((CeedObject)vec, ceed)); return CEED_ERROR_SUCCESS; } @@ -950,7 +1153,7 @@ int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) { @ref Advanced **/ -Ceed CeedVectorReturnCeed(CeedVector vec) { return vec->ceed; } +Ceed CeedVectorReturnCeed(CeedVector vec) { return CeedObjectReturnCeed((CeedObject)vec); } /** @brief Get the length of a `CeedVector` @@ -977,16 +1180,15 @@ int CeedVectorGetLength(CeedVector vec, CeedSize *length) { @ref User **/ int CeedVectorDestroy(CeedVector *vec) { - if (!*vec || *vec == CEED_VECTOR_ACTIVE || *vec == CEED_VECTOR_NONE || --(*vec)->ref_count > 0) { + if (!*vec || *vec == CEED_VECTOR_ACTIVE || *vec == CEED_VECTOR_NONE || CeedObjectDereference((CeedObject)*vec) > 0) { *vec = NULL; return CEED_ERROR_SUCCESS; } - CeedCheck((*vec)->state % 2 == 0, (*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use"); - CeedCheck((*vec)->num_readers == 0, (*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access"); + CeedCheck((*vec)->state % 2 == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use"); + CeedCheck((*vec)->num_readers == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access"); if ((*vec)->Destroy) CeedCall((*vec)->Destroy(*vec)); - - CeedCall(CeedDestroy(&(*vec)->ceed)); + CeedCall(CeedObjectDestroy_Private(&(*vec)->obj)); CeedCall(CeedFree(vec)); return 
CEED_ERROR_SUCCESS; } diff --git a/interface/ceed.c b/interface/ceed.c index ad7f09fa8e..6c94ec8db8 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -28,8 +28,7 @@ static struct { } backends[32]; static size_t num_backends; -#define CEED_FTABLE_ENTRY(class, method) \ - { #class #method, offsetof(struct class##_private, method) } +#define CEED_FTABLE_ENTRY(class, method) {#class #method, offsetof(struct class##_private, method)} /// @endcond /// @file @@ -139,6 +138,75 @@ int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), unsign return CEED_ERROR_SUCCESS; } +/** + @brief Create a work vector space for a `ceed` + + @param[in,out] ceed `Ceed` to create work vector space for + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedWorkVectorsCreate(Ceed ceed) { + CeedCall(CeedCalloc(1, &ceed->work_vectors)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a work vector space for a `ceed` + + @param[in,out] ceed `Ceed` to destroy work vector space for + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedWorkVectorsDestroy(Ceed ceed) { + if (!ceed->work_vectors) return CEED_ERROR_SUCCESS; + for (CeedSize i = 0; i < ceed->work_vectors->num_vecs; i++) { + CeedCheck(!ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " checked out but not returned"); + // Note: increase ref_count to prevent Ceed destructor from triggering again + CeedCall(CeedObjectReference((CeedObject)ceed)); + CeedCall(CeedObjectReference((CeedObject)ceed)); + 
CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i])); + // Note: restore ref_count + CeedObjectDereference((CeedObject)ceed); + } + CeedCall(CeedFree(&ceed->work_vectors->is_in_use)); + CeedCall(CeedFree(&ceed->work_vectors->vecs)); + CeedCall(CeedFree(&ceed->work_vectors)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief View a `Ceed` passed as a `CeedObject` + + @param[in] ceed `Ceed` to view + @param[in] stream Filestream to write to + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedView_Object(CeedObject ceed, FILE *stream) { + CeedCall(CeedView((Ceed)ceed, stream)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Destroy a `Ceed` passed as a `CeedObject` + + @param[in,out] ceed Address of `Ceed` context to destroy + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +static int CeedDestroy_Object(CeedObject *ceed) { + CeedCall(CeedDestroy((Ceed *)ceed)); + return CEED_ERROR_SUCCESS; +} + /// @} /// ---------------------------------------------------------------------------- @@ -313,10 +381,15 @@ static inline int CeedSetHostGenericArray(const void *source_array, CeedCopyMode void *target_array_owned, void *target_array_borrowed, void *target_array) { switch (copy_mode) { case CEED_COPY_VALUES: - if (!*(void **)target_array_owned) CeedCall(CeedCallocArray(num_values, size_unit, target_array_owned)); - if (source_array) memcpy(*(void **)target_array_owned, source_array, size_unit * num_values); - *(void **)target_array_borrowed = NULL; - *(void **)target_array = *(void **)target_array_owned; + if (!*(void **)target_array) { + if (*(void **)target_array_borrowed) { + *(void **)target_array = *(void **)target_array_borrowed; + } else { + if (!*(void **)target_array_owned) CeedCall(CeedCallocArray(num_values, size_unit, target_array_owned)); + *(void **)target_array = *(void **)target_array_owned; + } + } + if (source_array) memcpy(*(void **)target_array, source_array, size_unit * 
num_values); break; case CEED_OWN_POINTER: CeedCall(CeedFree(target_array_owned)); @@ -444,7 +517,9 @@ int CeedIsDebug(Ceed ceed, bool *is_debug) { } /** - @brief Get the root of the requested resource + @brief Get the root of the requested resource. + + Note: Caller is responsible for calling @ref CeedFree() on the `resource_root`. @param[in] ceed `Ceed` context to get resource name of @param[in] resource Full user specified resource @@ -479,7 +554,8 @@ int CeedGetParent(Ceed ceed, Ceed *parent) { CeedCall(CeedGetParent(ceed->parent, parent)); return CEED_ERROR_SUCCESS; } - *parent = ceed; + *parent = NULL; + CeedCall(CeedReferenceCopy(ceed, parent)); return CEED_ERROR_SUCCESS; } @@ -494,7 +570,8 @@ int CeedGetParent(Ceed ceed, Ceed *parent) { @ref Backend **/ int CeedGetDelegate(Ceed ceed, Ceed *delegate) { - *delegate = ceed->delegate; + *delegate = NULL; + if (ceed->delegate) CeedCall(CeedReferenceCopy(ceed->delegate, delegate)); return CEED_ERROR_SUCCESS; } @@ -512,7 +589,7 @@ int CeedGetDelegate(Ceed ceed, Ceed *delegate) { @ref Backend **/ int CeedSetDelegate(Ceed ceed, Ceed delegate) { - ceed->delegate = delegate; + CeedCall(CeedReferenceCopy(delegate, &ceed->delegate)); delegate->parent = ceed; return CEED_ERROR_SUCCESS; } @@ -532,7 +609,8 @@ int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name) { // Check for object delegate for (CeedInt i = 0; i < ceed->obj_delegate_count; i++) { if (!strcmp(obj_name, ceed->obj_delegates->obj_name)) { - *delegate = ceed->obj_delegates->delegate; + *delegate = NULL; + CeedCall(CeedReferenceCopy(ceed->obj_delegates->delegate, delegate)); return CEED_ERROR_SUCCESS; } } @@ -569,7 +647,7 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) { ceed->obj_delegate_count++; // Set object delegate - ceed->obj_delegates[count].delegate = delegate; + CeedCall(CeedReferenceCopy(delegate, &ceed->obj_delegates[count].delegate)); CeedCall(CeedStringAllocCopy(obj_name, 
&ceed->obj_delegates[count].obj_name)); // Set delegate parent @@ -577,21 +655,6 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) { return CEED_ERROR_SUCCESS; } -/** - @brief Get the fallback resource for `CeedOperator` - - @param[in] ceed `Ceed` context - @param[out] resource Variable to store fallback resource - - @return An error code: 0 - success, otherwise - failure - - @ref Backend -**/ -int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) { - *resource = (const char *)ceed->op_fallback_resource; - return CEED_ERROR_SUCCESS; -} - /** @brief Get the fallback `Ceed` for `CeedOperator` @@ -603,50 +666,32 @@ int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) { @ref Backend **/ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) { - if (ceed->has_valid_op_fallback_resource) { - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n"); - CeedDebug(ceed, "Getting fallback from %s to %s\n", ceed->resource, ceed->op_fallback_resource); + if (ceed->op_fallback_ceed) { + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed Fallback ----------\n"); + CeedDebug(ceed, "Falling back from Ceed with backend %s at address %p to Ceed with backend %s at address %p", ceed->resource, ceed, + ceed->op_fallback_ceed->resource, ceed->op_fallback_ceed); } - // Create fallback Ceed if uninitalized - if (!ceed->op_fallback_ceed && ceed->has_valid_op_fallback_resource) { - CeedDebug(ceed, "Creating fallback Ceed"); - - Ceed fallback_ceed; - const char *fallback_resource; - - CeedCall(CeedGetOperatorFallbackResource(ceed, &fallback_resource)); - CeedCall(CeedInit(fallback_resource, &fallback_ceed)); - fallback_ceed->op_fallback_parent = ceed; - fallback_ceed->Error = ceed->Error; - ceed->op_fallback_ceed = fallback_ceed; - } - *fallback_ceed = ceed->op_fallback_ceed; + *fallback_ceed = NULL; + if (ceed->op_fallback_ceed) 
CeedCall(CeedReferenceCopy(ceed->op_fallback_ceed, fallback_ceed)); return CEED_ERROR_SUCCESS; } /** @brief Set the fallback resource for `CeedOperator`. - The current resource, if any, is freed by calling this function. - This string is freed upon the destruction of the `Ceed` context. + The current fallback, if any, is freed by calling this function. - @param[in,out] ceed `Ceed` context - @param[in] resource Fallback resource to set + @param[in,out] ceed `Ceed` context + @param[in] fallback_ceed `Ceed` context to create fallback operators @return An error code: 0 - success, otherwise - failure @ref Backend **/ -int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource) { - // Free old - CeedCall(CeedFree(&ceed->op_fallback_resource)); - - // Set new - CeedCall(CeedStringAllocCopy(resource, (char **)&ceed->op_fallback_resource)); - - // Check validity - ceed->has_valid_op_fallback_resource = ceed->op_fallback_resource && ceed->resource && strcmp(ceed->op_fallback_resource, ceed->resource); +int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed) { + CeedCall(CeedReferenceCopy(fallback_ceed, &ceed->op_fallback_ceed)); + fallback_ceed->parent = ceed; return CEED_ERROR_SUCCESS; } @@ -746,7 +791,317 @@ int CeedSetData(Ceed ceed, void *data) { @ref Backend **/ int CeedReference(Ceed ceed) { - ceed->ref_count++; + CeedCall(CeedObjectReference((CeedObject)ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Computes the current memory usage of the work vectors in a `Ceed` context and prints to debug.abort + + @param[in] ceed `Ceed` context + @param[out] usage_mb Address of the variable where the MB of work vector usage will be stored + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ +int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) { + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend 
does not implement VectorCreate"); + CeedCall(CeedGetWorkVectorMemoryUsage(delegate, usage_mb)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } + *usage_mb = 0.0; + if (ceed->work_vectors) { + for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) { + CeedSize vec_len; + CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len)); + *usage_mb += vec_len; + } + *usage_mb *= sizeof(CeedScalar) * 1e-6; + CeedDebug(ceed, "Resource {%s}: Work vectors memory usage: %" CeedInt_FMT " vectors, %g MB\n", ceed->resource, ceed->work_vectors->num_vecs, + *usage_mb); + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Clear inactive work vectors in a `Ceed` context below a minimum length. + + @param[in,out] ceed `Ceed` context + @param[in] min_len Minimum length of work vector to keep + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) { + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedClearWorkVectors(delegate, min_len)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } + if (!ceed->work_vectors) return CEED_ERROR_SUCCESS; + for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) { + if (ceed->work_vectors->is_in_use[i]) continue; + CeedSize vec_len; + CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len)); + if (vec_len < min_len) { + // Note: increase ref_count to prevent Ceed destructor from triggering + CeedCall(CeedObjectReference((CeedObject)ceed)); + CeedCall(CeedObjectReference((CeedObject)ceed)); + CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i])); + // Note: restore ref_count + CeedObjectDereference((CeedObject)ceed); + ceed->work_vectors->num_vecs--; + if (ceed->work_vectors->num_vecs > 0) { + ceed->work_vectors->vecs[i] = 
ceed->work_vectors->vecs[ceed->work_vectors->num_vecs]; + ceed->work_vectors->is_in_use[i] = ceed->work_vectors->is_in_use[ceed->work_vectors->num_vecs]; + ceed->work_vectors->is_in_use[ceed->work_vectors->num_vecs] = false; + i--; + } + } + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get a `CeedVector` for scratch work from a `Ceed` context. + + Note: This vector must be restored with @ref CeedRestoreWorkVector(). + + @param[in] ceed `Ceed` context + @param[in] len Minimum length of work vector + @param[out] vec Address of the variable where `CeedVector` will be stored + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) { + CeedInt i = 0; + CeedScalar usage_mb; + + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedGetWorkVector(delegate, len, vec)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } + + if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed)); + + // Search for big enough work vector + for (i = 0; i < ceed->work_vectors->num_vecs; i++) { + if (!ceed->work_vectors->is_in_use[i]) { + CeedSize work_len; + + CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &work_len)); + if (work_len >= len) break; + } + } + // Long enough vector was not found + if (i == ceed->work_vectors->num_vecs) { + if (ceed->work_vectors->max_vecs == 0) { + ceed->work_vectors->max_vecs = 1; + CeedCall(CeedCalloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->vecs)); + CeedCall(CeedCalloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->is_in_use)); + } else if (ceed->work_vectors->max_vecs == i) { + ceed->work_vectors->max_vecs *= 2; + CeedCall(CeedRealloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->vecs)); + CeedCall(CeedRealloc(ceed->work_vectors->max_vecs, 
&ceed->work_vectors->is_in_use)); + } + ceed->work_vectors->num_vecs++; + CeedCallBackend(CeedVectorCreate(ceed, len, &ceed->work_vectors->vecs[i])); + // Note: ref_count manipulation to prevent a ref-loop + CeedObjectDereference((CeedObject)ceed); + if (ceed->is_debug) CeedGetWorkVectorMemoryUsage(ceed, &usage_mb); + } + // Return pointer to work vector + ceed->work_vectors->is_in_use[i] = true; + *vec = NULL; + CeedCall(CeedVectorReferenceCopy(ceed->work_vectors->vecs[i], vec)); + // Note: bump ref_count to account for external access + CeedCall(CeedObjectReference((CeedObject)ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Restore a `CeedVector` for scratch work from a `Ceed` context from @ref CeedGetWorkVector() + + @param[in] ceed `Ceed` context + @param[out] vec `CeedVector` to restore + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) { + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedRestoreWorkVector(delegate, vec)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } + + for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) { + if (*vec == ceed->work_vectors->vecs[i]) { + CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned"); + CeedCall(CeedVectorDestroy(vec)); + ceed->work_vectors->is_in_use[i] = false; + // Note: reduce ref_count again to prevent a ref-loop + CeedObjectDereference((CeedObject)ceed); + return CEED_ERROR_SUCCESS; + } + } + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_MAJOR, "vec was not checked out via CeedGetWorkVector()"); + // LCOV_EXCL_STOP +} + +/** + @brief Retrieve list of additional JiT source roots from `Ceed` context. 
+ + Note: The caller is responsible for restoring `jit_source_roots` with @ref CeedRestoreJitSourceRoots(). + + @param[in] ceed `Ceed` context + @param[out] num_source_roots Number of JiT source directories + @param[out] jit_source_roots Absolute paths to additional JiT source directories + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + *num_source_roots = ceed_parent->num_jit_source_roots; + *jit_source_roots = (const char **)ceed_parent->jit_source_roots; + ceed_parent->num_jit_source_roots_readers++; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Retrieve list of additional Rust source roots from `Ceed` context. + + Note: The caller is responsible for restoring `rust_source_roots` with @ref CeedRestoreRustSourceRoots(). + + @param[in] ceed `Ceed` context + @param[out] num_source_roots Number of JiT source directories + @param[out] rust_source_roots Absolute paths to additional Rust source directories + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedGetRustSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***rust_source_roots) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + *num_source_roots = ceed_parent->num_rust_source_roots; + *rust_source_roots = (const char **)ceed_parent->rust_source_roots; + ceed_parent->num_rust_source_roots_readers++; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Restore list of additional JiT source roots from with @ref CeedGetJitSourceRoots() + + @param[in] ceed `Ceed` context + @param[out] jit_source_roots Absolute paths to additional JiT source directories + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedRestoreJitSourceRoots(Ceed ceed, 
const char ***jit_source_roots) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + *jit_source_roots = NULL; + ceed_parent->num_jit_source_roots_readers--; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Restore list of additional Rust source roots from with @ref CeedGetJitSourceRoots() + + @param[in] ceed `Ceed` context + @param[out] rust_source_roots Absolute paths to additional Rust source directories + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_roots) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + *rust_source_roots = NULL; + ceed_parent->num_rust_source_roots_readers--; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Retrieve list of additional JiT defines from `Ceed` context. + + Note: The caller is responsible for restoring `jit_defines` with @ref CeedRestoreJitDefines(). 
+ + @param[in] ceed `Ceed` context + @param[out] num_jit_defines Number of JiT defines + @param[out] jit_defines Strings such as `foo=bar`, used as `-Dfoo=bar` in JiT + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedGetJitDefines(Ceed ceed, CeedInt *num_jit_defines, const char ***jit_defines) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + *num_jit_defines = ceed_parent->num_jit_defines; + *jit_defines = (const char **)ceed_parent->jit_defines; + ceed_parent->num_jit_defines_readers++; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Restore list of additional JiT defines from with @ref CeedGetJitDefines() + + @param[in] ceed `Ceed` context + @param[out] jit_defines String such as `foo=bar`, used as `-Dfoo=bar` in JiT + + @return An error code: 0 - success, otherwise - failure + + @ref Backend +**/ +int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + *jit_defines = NULL; + ceed_parent->num_jit_defines_readers--; + CeedCall(CeedDestroy(&ceed_parent)); return CEED_ERROR_SUCCESS; } @@ -892,15 +1247,16 @@ int CeedInit(const char *resource, Ceed *ceed) { // Setup Ceed CeedCall(CeedCalloc(1, ceed)); + CeedCall(CeedObjectCreate(NULL, CeedView_Object, CeedDestroy_Object, &(*ceed)->obj)); CeedCall(CeedCalloc(1, &(*ceed)->jit_source_roots)); + CeedCall(CeedCalloc(1, &(*ceed)->rust_source_roots)); const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER"); if (!ceed_error_handler) ceed_error_handler = "abort"; if (!strcmp(ceed_error_handler, "exit")) (*ceed)->Error = CeedErrorExit; else if (!strcmp(ceed_error_handler, "store")) (*ceed)->Error = CeedErrorStore; else (*ceed)->Error = CeedErrorAbort; memcpy((*ceed)->err_msg, "No error message stored", 24); - (*ceed)->ref_count = 1; - (*ceed)->data = NULL; + (*ceed)->data = NULL; // Set lookup table FOffset f_offsets[] = { @@ -924,9 
+1280,11 @@ int CeedInit(const char *resource, Ceed *ceed) { CEED_FTABLE_ENTRY(Ceed, CompositeOperatorCreate), CEED_FTABLE_ENTRY(CeedVector, HasValidArray), CEED_FTABLE_ENTRY(CeedVector, HasBorrowedArrayOfType), + CEED_FTABLE_ENTRY(CeedVector, CopyStrided), CEED_FTABLE_ENTRY(CeedVector, SetArray), CEED_FTABLE_ENTRY(CeedVector, TakeArray), CEED_FTABLE_ENTRY(CeedVector, SetValue), + CEED_FTABLE_ENTRY(CeedVector, SetValueStrided), CEED_FTABLE_ENTRY(CeedVector, SyncArray), CEED_FTABLE_ENTRY(CeedVector, GetArray), CEED_FTABLE_ENTRY(CeedVector, GetArrayRead), @@ -948,9 +1306,12 @@ int CeedInit(const char *resource, Ceed *ceed) { CEED_FTABLE_ENTRY(CeedElemRestriction, GetOffsets), CEED_FTABLE_ENTRY(CeedElemRestriction, GetOrientations), CEED_FTABLE_ENTRY(CeedElemRestriction, GetCurlOrientations), + CEED_FTABLE_ENTRY(CeedElemRestriction, GetAtPointsElementOffset), CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy), CEED_FTABLE_ENTRY(CeedBasis, Apply), + CEED_FTABLE_ENTRY(CeedBasis, ApplyAdd), CEED_FTABLE_ENTRY(CeedBasis, ApplyAtPoints), + CEED_FTABLE_ENTRY(CeedBasis, ApplyAddAtPoints), CEED_FTABLE_ENTRY(CeedBasis, Destroy), CEED_FTABLE_ENTRY(CeedTensorContract, Apply), CEED_FTABLE_ENTRY(CeedTensorContract, Destroy), @@ -990,10 +1351,6 @@ int CeedInit(const char *resource, Ceed *ceed) { CeedCall(CeedCalloc(sizeof(f_offsets), &(*ceed)->f_offsets)); memcpy((*ceed)->f_offsets, f_offsets, sizeof(f_offsets)); - // Set fallback for advanced CeedOperator functions - const char fallback_resource[] = ""; - CeedCall(CeedSetOperatorFallbackResource(*ceed, fallback_resource)); - // Record env variables CEED_DEBUG or DBG (*ceed)->is_debug = getenv("CEED_DEBUG") || getenv("DEBUG") || getenv("DBG"); @@ -1004,6 +1361,16 @@ int CeedInit(const char *resource, Ceed *ceed) { // Note: there will always be the default root for every Ceed but all additional paths are added to the top-most parent CeedCall(CeedAddJitSourceRoot(*ceed, (char *)CeedJitSourceRootDefault)); + // By default, make cuda 
compile without clang, use nvrtc instead + // Note that this is overridden if a rust file is included (rust requires clang) + const char *env = getenv("GPU_CLANG"); + + if (env && strcmp(env, "1") == 0) { + (*ceed)->cuda_compile_with_clang = true; + } else { + (*ceed)->cuda_compile_with_clang = false; + } + // Backend specific setup CeedCall(backends[match_index].init(&resource[match_help], *ceed)); return CEED_ERROR_SUCCESS; @@ -1029,6 +1396,7 @@ int CeedSetStream(Ceed ceed, void *handle) { if (delegate) CeedCall(CeedSetStream(delegate, handle)); else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support setting stream"); + CeedCall(CeedDestroy(&delegate)); } return CEED_ERROR_SUCCESS; } @@ -1092,6 +1460,7 @@ int CeedGetPreferredMemType(Ceed ceed, CeedMemType *mem_type) { } else { *mem_type = CEED_MEM_HOST; } + CeedCall(CeedDestroy(&delegate)); } return CEED_ERROR_SUCCESS; } @@ -1125,14 +1494,114 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) { Ceed ceed_parent; CeedCall(CeedGetParent(ceed, &ceed_parent)); + CeedCheck(!ceed_parent->num_jit_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access has not been restored"); CeedInt index = ceed_parent->num_jit_source_roots; size_t path_length = strlen(jit_source_root); - CeedCall(CeedRealloc(index + 1, &ceed_parent->jit_source_roots)); + if (ceed_parent->num_jit_source_roots == ceed_parent->max_jit_source_roots) { + if (ceed_parent->max_jit_source_roots == 0) ceed_parent->max_jit_source_roots = 1; + ceed_parent->max_jit_source_roots *= 2; + CeedCall(CeedRealloc(ceed_parent->max_jit_source_roots, &ceed_parent->jit_source_roots)); + } CeedCall(CeedCalloc(path_length + 1, &ceed_parent->jit_source_roots[index])); memcpy(ceed_parent->jit_source_roots[index], jit_source_root, path_length); ceed_parent->num_jit_source_roots++; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set additional Rust source root for `Ceed` 
context for use in QFunction + + @param[in,out] ceed `Ceed` context + @param[in] rust_source_root Absolute path to additional Rust source directory + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + CeedCheck(!ceed_parent->num_rust_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add Rust source root, read access has not been restored"); + + CeedInt index = ceed_parent->num_rust_source_roots; + size_t path_length = strlen(rust_source_root); + + if (ceed_parent->num_rust_source_roots == ceed_parent->max_rust_source_roots) { + if (ceed_parent->max_rust_source_roots == 0) ceed_parent->max_rust_source_roots = 1; + ceed_parent->max_rust_source_roots *= 2; + CeedCall(CeedRealloc(ceed_parent->max_rust_source_roots, &ceed_parent->rust_source_roots)); + } + CeedCall(CeedCalloc(path_length + 1, &ceed_parent->rust_source_roots[index])); + memcpy(ceed_parent->rust_source_roots[index], rust_source_root, path_length); + ceed_parent->num_rust_source_roots++; + ceed_parent->cuda_compile_with_clang = true; + ceed->cuda_compile_with_clang = true; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set additional JiT compiler define for `Ceed` context + + @param[in,out] ceed `Ceed` context + @param[in] jit_define String such as `foo=bar`, used as `-Dfoo=bar` in JiT + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedAddJitDefine(Ceed ceed, const char *jit_define) { + Ceed ceed_parent; + + CeedCall(CeedGetParent(ceed, &ceed_parent)); + CeedCheck(!ceed_parent->num_jit_defines_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT define, read access has not been restored"); + + CeedInt index = ceed_parent->num_jit_defines; + size_t define_length = strlen(jit_define); + + if (ceed_parent->num_jit_defines == ceed_parent->max_jit_defines) { + if 
(ceed_parent->max_jit_defines == 0) ceed_parent->max_jit_defines = 1; + ceed_parent->max_jit_defines *= 2; + CeedCall(CeedRealloc(ceed_parent->max_jit_defines, &ceed_parent->jit_defines)); + } + CeedCall(CeedCalloc(define_length + 1, &ceed_parent->jit_defines[index])); + memcpy(ceed_parent->jit_defines[index], jit_define, define_length); + ceed_parent->num_jit_defines++; + CeedCall(CeedDestroy(&ceed_parent)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Set the number of tabs to indent for @ref CeedView() output + + @param[in] ceed `Ceed` to set the number of view tabs + @param[in] num_tabs Number of view tabs to set + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs) { + CeedCall(CeedObjectSetNumViewTabs((CeedObject)ceed, num_tabs)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the number of tabs to indent for @ref CeedView() output + + @param[in] ceed `Ceed` to get the number of view tabs + @param[out] num_tabs Number of view tabs + + @return Error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs) { + CeedCall(CeedObjectGetNumViewTabs((CeedObject)ceed, num_tabs)); return CEED_ERROR_SUCCESS; } @@ -1147,15 +1616,24 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) { @ref User **/ int CeedView(Ceed ceed, FILE *stream) { + char *tabs = NULL; CeedMemType mem_type; CeedCall(CeedGetPreferredMemType(ceed, &mem_type)); + { + CeedInt num_tabs = 0; + + CeedCall(CeedGetNumViewTabs(ceed, &num_tabs)); + CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs)); + for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' '; + } fprintf(stream, - "Ceed\n" - " Ceed Resource: %s\n" - " Preferred MemType: %s\n", - ceed->resource, CeedMemTypes[mem_type]); + "%sCeed\n" + "%s Ceed Resource: %s\n" + "%s Preferred MemType: %s\n", + tabs, tabs, ceed->resource, tabs, CeedMemTypes[mem_type]); + 
CeedCall(CeedFree(&tabs)); return CEED_ERROR_SUCCESS; } @@ -1169,10 +1647,15 @@ int CeedView(Ceed ceed, FILE *stream) { @ref User **/ int CeedDestroy(Ceed *ceed) { - if (!*ceed || --(*ceed)->ref_count > 0) { + if (!*ceed || CeedObjectDereference((CeedObject)*ceed) > 0) { *ceed = NULL; return CEED_ERROR_SUCCESS; } + + CeedCheck(!(*ceed)->num_jit_source_roots_readers, *ceed, CEED_ERROR_ACCESS, + "Cannot destroy ceed context, read access for JiT source roots has been granted"); + CeedCheck(!(*ceed)->num_jit_defines_readers, *ceed, CEED_ERROR_ACCESS, "Cannot destroy ceed context, read access for JiT defines has been granted"); + if ((*ceed)->delegate) CeedCall(CeedDestroy(&(*ceed)->delegate)); if ((*ceed)->obj_delegate_count > 0) { @@ -1190,10 +1673,21 @@ int CeedDestroy(Ceed *ceed) { } CeedCall(CeedFree(&(*ceed)->jit_source_roots)); + for (CeedInt i = 0; i < (*ceed)->num_jit_defines; i++) { + CeedCall(CeedFree(&(*ceed)->jit_defines[i])); + } + CeedCall(CeedFree(&(*ceed)->jit_defines)); + + for (CeedInt i = 0; i < (*ceed)->num_rust_source_roots; i++) { + CeedCall(CeedFree(&(*ceed)->rust_source_roots[i])); + } + CeedCall(CeedFree(&(*ceed)->rust_source_roots)); + CeedCall(CeedFree(&(*ceed)->f_offsets)); CeedCall(CeedFree(&(*ceed)->resource)); CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed)); - CeedCall(CeedFree(&(*ceed)->op_fallback_resource)); + CeedCall(CeedWorkVectorsDestroy(*ceed)); + CeedCall(CeedObjectDestroy_Private(&(*ceed)->obj)); CeedCall(CeedFree(ceed)); return CEED_ERROR_SUCCESS; } @@ -1201,7 +1695,6 @@ int CeedDestroy(Ceed *ceed) { // LCOV_EXCL_START const char *CeedErrorFormat(Ceed ceed, const char *format, va_list *args) { if (ceed->parent) return CeedErrorFormat(ceed->parent, format, args); - if (ceed->op_fallback_parent) return CeedErrorFormat(ceed->op_fallback_parent, format, args); // Using pointer to va_list for better FFI, but clang-tidy can't verify va_list is initalized vsnprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, format, *args); // NOLINT 
return ceed->err_msg; @@ -1265,7 +1758,6 @@ int CeedErrorReturn(Ceed ceed, const char *filename, int line_no, const char *fu // LCOV_EXCL_START int CeedErrorStore(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) { if (ceed->parent) return CeedErrorStore(ceed->parent, filename, line_no, func, err_code, format, args); - if (ceed->op_fallback_parent) return CeedErrorStore(ceed->op_fallback_parent, filename, line_no, func, err_code, format, args); // Build message int len = snprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, "%s:%d in %s(): ", filename, line_no, func); @@ -1345,7 +1837,6 @@ int CeedSetErrorHandler(Ceed ceed, CeedErrorHandler handler) { **/ int CeedGetErrorMessage(Ceed ceed, const char **err_msg) { if (ceed->parent) return CeedGetErrorMessage(ceed->parent, err_msg); - if (ceed->op_fallback_parent) return CeedGetErrorMessage(ceed->op_fallback_parent, err_msg); *err_msg = ceed->err_msg; return CEED_ERROR_SUCCESS; } @@ -1364,7 +1855,6 @@ int CeedGetErrorMessage(Ceed ceed, const char **err_msg) { **/ int CeedResetErrorMessage(Ceed ceed, const char **err_msg) { if (ceed->parent) return CeedResetErrorMessage(ceed->parent, err_msg); - if (ceed->op_fallback_parent) return CeedResetErrorMessage(ceed->op_fallback_parent, err_msg); *err_msg = NULL; memcpy(ceed->err_msg, "No error message stored", 24); return CEED_ERROR_SUCCESS; @@ -1387,7 +1877,7 @@ int CeedResetErrorMessage(Ceed ceed, const char **err_msg) { @ref Developer - @sa CEED_VERSION_GE() + @sa CEED_VERSION_GE() CeedGetGitVersion() CeedGetBuildConfiguration() */ int CeedGetVersion(int *major, int *minor, int *patch, bool *release) { if (major) *major = CEED_VERSION_MAJOR; diff --git a/julia/LibCEED.jl/examples/ex3-volume.jl b/julia/LibCEED.jl/examples/ex3-volume.jl new file mode 100644 index 0000000000..68edf59817 --- /dev/null +++ b/julia/LibCEED.jl/examples/ex3-volume.jl @@ -0,0 +1,197 @@ +using LibCEED, Printf + +include("common.jl") + 
+function transform_mesh_coords!(dim, mesh_size, mesh_coords) + @witharray coords = mesh_coords begin + if dim == 1 + for i = 1:mesh_size + # map [0,1] to [0,1] varying the mesh density + coords[i] = 0.5 + 1.0/sqrt(3.0)*sin((2.0/3.0)*pi*(coords[i] - 0.5)) + end + exact_volume = 1.0 + else + num_nodes = mesh_size÷dim + @inbounds @simd for i = 1:num_nodes + # map (x,y) from [0,1]x[0,1] to the quarter annulus with polar + # coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi + u = coords[i] + v = coords[i+num_nodes] + u = 1.0 + u + v = pi/2*v + coords[i] = u*cos(v) + coords[i+num_nodes] = u*sin(v) + end + exact_volume = 3.0/4.0*pi + end + return exact_volume + end +end + +function run_ex3(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size) + ncompx = dim + prob_size < 0 && (prob_size = 256*1024) + + ceed = Ceed(ceed_spec) + mesh_basis = + create_tensor_h1_lagrange_basis(ceed, dim, ncompx, mesh_order + 1, num_qpts, GAUSS) + sol_basis = + create_tensor_h1_lagrange_basis(ceed, dim, 1, sol_order + 1, num_qpts, GAUSS) + + # Determine the mesh size based on the given approximate problem size. + nxyz = get_cartesian_mesh_size(dim, sol_order, prob_size) + println("Mesh size: ", nxyz) + + # Build CeedElemRestriction objects describing the mesh and solution discrete + # representations. + mesh_size, mesh_rstr, _ = + build_cartesian_restriction(ceed, dim, nxyz, mesh_order, ncompx, num_qpts) + num_q_comp = 1 + div(dim*(dim + 1), 2) + sol_size, _, qdata_rstr_i = build_cartesian_restriction( + ceed, + dim, + nxyz, + sol_order, + num_q_comp, + num_qpts, + mode=StridedOnly, + ) + sol_size, sol_rstr, sol_rstr_i = build_cartesian_restriction( + ceed, + dim, + nxyz, + sol_order, + 1, + num_qpts, + mode=RestrictionAndStrided, + ) + println("Number of mesh nodes : ", div(mesh_size, dim)) + println("Number of solution nodes : ", sol_size) + + # Create a CeedVector with the mesh coordinates. 
+ mesh_coords = CeedVector(ceed, mesh_size) + set_cartesian_mesh_coords!(dim, nxyz, mesh_order, mesh_coords) + # Apply a transformation to the mesh. + exact_vol = transform_mesh_coords!(dim, mesh_size, mesh_coords) + + # Create the Q-function that builds the mass+diffusion operator (i.e., it computes the quadrature data). + @interior_qf build_qfunc = ( + ceed, + dim=dim, + (dx, :in, EVAL_GRAD, dim, dim), # Jacobian of the mesh coordinates + (weights, :in, EVAL_WEIGHT), # quadrature weights + (qdata, :out, EVAL_NONE, num_q_comp), # quadrature data output + begin + # Compute determinant + det_J = det(dx) + + # Store mass component + qdata[1] = weights*det_J + + # Store diffusion components (J^T * J): upper triangle packed column by column, as indexed when the operator is applied + idx = 2 + for j = 1:dim + for i = 1:j + qdata[idx] = dx[:, i]'*dx[:, j] + idx += 1 + end + end + end, + ) + + # Create the operator that builds the quadrature data for the mass+diffusion operator. + build_oper = Operator( + ceed, + qf=build_qfunc, + fields=[ + (:dx, mesh_rstr, mesh_basis, CeedVectorActive()), + (:weights, ElemRestrictionNone(), mesh_basis, CeedVectorNone()), + (:qdata, qdata_rstr_i, BasisNone(), CeedVectorActive()), + ], + ) + + # Compute the quadrature data for the mass+diff operator. + elem_qpts = num_qpts^dim + num_elem = prod(nxyz) + qdata = CeedVector(ceed, num_elem*elem_qpts*num_q_comp) + print("Computing the quadrature data for the mass+diffusion operator ...") + flush(stdout) + apply!(build_oper, mesh_coords, qdata) + println(" done.") + + # Create the Q-function that defines the action of the mass+diffusion operator. 
+ @interior_qf apply_qfunc = ( + ceed, + dim=dim, + (u, :in, EVAL_INTERP), + (du, :in, EVAL_GRAD, dim), + (qdata, :in, EVAL_NONE, num_q_comp), + (v, :out, EVAL_INTERP), + (dv, :out, EVAL_GRAD, dim), + begin + # Apply mass: v = qdata[1] * u + v .= qdata[1].*u + + # Apply diffusion: dv = (qdata[2:end]) * du + # The qdata contains the symmetric diffusion tensor (J^T*J) + # dv_i = sum_j (J^T*J)_{i,j} * du_j + + # For efficiency, rebuild the matrix from stored components + idx = 2 + for i = 1:dim + dv_i = 0.0 + for j = 1:dim + # Reconstruct symmetric matrix element + if j >= i + mat_idx = idx + div((j - 1)*j, 2) + (i - 1) + else + mat_idx = idx + div((i - 1)*i, 2) + (j - 1) + end + dv_i += qdata[mat_idx]*du[j] + end + dv[i] = dv_i + end + end, + ) + + # Create the mass+diffusion operator. + oper = Operator( + ceed, + qf=apply_qfunc, + fields=[ + (:u, sol_rstr, sol_basis, CeedVectorActive()), + (:du, sol_rstr, sol_basis, CeedVectorActive()), + (:qdata, qdata_rstr_i, BasisNone(), qdata), + (:v, sol_rstr, sol_basis, CeedVectorActive()), + (:dv, sol_rstr, sol_basis, CeedVectorActive()), + ], + ) + + # Compute the mesh volume using the mass+diffusion operator: vol = 1^T \cdot (M + K) \cdot 1 + print("Computing the mesh volume using the formula: vol = 1^T * (M + K) * 1...") + flush(stdout) + # Create auxiliary solution-size vectors. + u = CeedVector(ceed, sol_size) + v = CeedVector(ceed, sol_size) + # Initialize 'u' with ones. + u[] = 1.0 + # Apply the mass+diffusion operator: 'u' -> 'v'. + apply!(oper, u, v) + # Compute and print the sum of the entries of 'v' giving the mesh volume. 
+ vol = witharray_read(sum, v, MEM_HOST) + + println(" done.") + @printf("Exact mesh volume : % .14g\n", exact_vol) + @printf("Computed mesh volume : % .14g\n", vol) + @printf("Volume error : % .14g\n", vol - exact_vol) +end + +# Entry point +run_ex3( + ceed_spec="/cpu/self", + dim=3, + mesh_order=4, + sol_order=4, + num_qpts=4 + 2, + prob_size=-1, +) diff --git a/julia/LibCEED.jl/src/Operator.jl b/julia/LibCEED.jl/src/Operator.jl index d1de710c54..2ad8f41b1c 100644 --- a/julia/LibCEED.jl/src/Operator.jl +++ b/julia/LibCEED.jl/src/Operator.jl @@ -69,11 +69,11 @@ collection `ops`. """ function create_composite_operator(c::Ceed, ops) ref = Ref{C.CeedOperator}() - C.CeedCompositeOperatorCreate(c[], ref) + C.CeedOperatorCreateComposite(c[], ref) comp_op = Operator(ref, QFunctionNone(), QFunctionNone(), QFunctionNone()) comp_op.sub_ops = ops for op ∈ ops - C.CeedCompositeOperatorAddSub(comp_op[], op[]) + C.CeedOperatorCompositeAddSub(comp_op[], op[]) end comp_op end diff --git a/julia/LibCEED.jl/src/generated/libceed_bindings.jl b/julia/LibCEED.jl/src/generated/libceed_bindings.jl index f814609b86..d4bba38974 100644 --- a/julia/LibCEED.jl/src/generated/libceed_bindings.jl +++ b/julia/LibCEED.jl/src/generated/libceed_bindings.jl @@ -436,8 +436,8 @@ function CeedBasisApply(basis, num_elem, t_mode, eval_mode, u, v) ccall((:CeedBasisApply, libceed), Cint, (CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector), basis, num_elem, t_mode, eval_mode, u, v) end -function CeedBasisApplyAtPoints(basis, num_points, t_mode, eval_mode, x_ref, u, v) - ccall((:CeedBasisApplyAtPoints, libceed), Cint, (CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector), basis, num_points, t_mode, eval_mode, x_ref, u, v) +function CeedBasisApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v) + ccall((:CeedBasisApplyAtPoints, libceed), Cint, (CeedBasis, CeedInt, Ptr{CeedInt}, CeedTransposeMode, CeedEvalMode, CeedVector, 
CeedVector, CeedVector), basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v) end function CeedBasisGetCeed(basis, ceed) @@ -658,8 +658,8 @@ function CeedOperatorCreate(ceed, qf, dqf, dqfT, op) ccall((:CeedOperatorCreate, libceed), Cint, (Ceed, CeedQFunction, CeedQFunction, CeedQFunction, Ptr{CeedOperator}), ceed, qf, dqf, dqfT, op) end -function CeedCompositeOperatorCreate(ceed, op) - ccall((:CeedCompositeOperatorCreate, libceed), Cint, (Ceed, Ptr{CeedOperator}), ceed, op) +function CeedOperatorCreateComposite(ceed, op) + ccall((:CeedOperatorCreateComposite, libceed), Cint, (Ceed, Ptr{CeedOperator}), ceed, op) end function CeedOperatorReferenceCopy(op, op_copy) @@ -674,16 +674,16 @@ function CeedOperatorGetFields(op, num_input_fields, input_fields, num_output_fi ccall((:CeedOperatorGetFields, libceed), Cint, (CeedOperator, Ptr{CeedInt}, Ptr{Ptr{CeedOperatorField}}, Ptr{CeedInt}, Ptr{Ptr{CeedOperatorField}}), op, num_input_fields, input_fields, num_output_fields, output_fields) end -function CeedCompositeOperatorAddSub(composite_op, sub_op) - ccall((:CeedCompositeOperatorAddSub, libceed), Cint, (CeedOperator, CeedOperator), composite_op, sub_op) +function CeedOperatorCompositeAddSub(composite_op, sub_op) + ccall((:CeedOperatorCompositeAddSub, libceed), Cint, (CeedOperator, CeedOperator), composite_op, sub_op) end -function CeedCompositeOperatorGetNumSub(op, num_suboperators) - ccall((:CeedCompositeOperatorGetNumSub, libceed), Cint, (CeedOperator, Ptr{CeedInt}), op, num_suboperators) +function CeedOperatorCompositeGetNumSub(op, num_suboperators) + ccall((:CeedOperatorCompositeGetNumSub, libceed), Cint, (CeedOperator, Ptr{CeedInt}), op, num_suboperators) end -function CeedCompositeOperatorGetSubList(op, sub_operators) - ccall((:CeedCompositeOperatorGetSubList, libceed), Cint, (CeedOperator, Ptr{Ptr{CeedOperator}}), op, sub_operators) +function CeedOperatorCompositeGetSubList(op, sub_operators) + ccall((:CeedOperatorCompositeGetSubList, libceed), Cint, 
(CeedOperator, Ptr{Ptr{CeedOperator}}), op, sub_operators) end function CeedOperatorCheckReady(op) @@ -738,8 +738,8 @@ function CeedOperatorLinearAssemble(op, values) ccall((:CeedOperatorLinearAssemble, libceed), Cint, (CeedOperator, CeedVector), op, values) end -function CeedCompositeOperatorGetMultiplicity(op, num_skip_indices, skip_indices, mult) - ccall((:CeedCompositeOperatorGetMultiplicity, libceed), Cint, (CeedOperator, CeedInt, Ptr{CeedInt}, CeedVector), op, num_skip_indices, skip_indices, mult) +function CeedOperatorCompositeGetMultiplicity(op, num_skip_indices, skip_indices, mult) + ccall((:CeedOperatorCompositeGetMultiplicity, libceed), Cint, (CeedOperator, CeedInt, Ptr{CeedInt}, CeedVector), op, num_skip_indices, skip_indices, mult) end function CeedOperatorMultigridLevelCreate(op_fine, p_mult_fine, rstr_coarse, basis_coarse, op_coarse, op_prolong, op_restrict) diff --git a/julia/LibCEED.jl/test/rundevtests.jl b/julia/LibCEED.jl/test/rundevtests.jl index 9527d8d2dc..59d0e4840e 100644 --- a/julia/LibCEED.jl/test/rundevtests.jl +++ b/julia/LibCEED.jl/test/rundevtests.jl @@ -8,4 +8,36 @@ function checkoutput(str, fname) return true end -@testset "LibCEED Development Tests" begin end +@testset "LibCEED Development Tests" begin + @testset "Operator" begin + c = Ceed() + @interior_qf id = ( + c, + (input, :in, EVAL_INTERP), + (output, :out, EVAL_INTERP), + begin + output[] = input + end, + ) + b = create_tensor_h1_lagrange_basis(c, 3, 1, 3, 3, GAUSS_LOBATTO) + n = getnumnodes(b) + offsets = Vector{CeedInt}(0:n-1) + r = create_elem_restriction(c, 1, n, 1, 1, n, offsets) + op = Operator( + c; + qf=id, + fields=[ + (:input, r, b, CeedVectorActive()), + (:output, r, b, CeedVectorActive()), + ], + ) + + v = rand(CeedScalar, n) + v1 = CeedVector(c, v) + v2 = CeedVector(c, n) + + comp_op = create_composite_operator(c, [op]) + apply!(comp_op, v1, v2) + @test @witharray_read(a1 = v1, @witharray_read(a2 = v2, a1 == a2)) + end +end diff --git 
a/julia/LibCEED.jl/test/runtests.jl b/julia/LibCEED.jl/test/runtests.jl index 83c7598ecd..724240d786 100644 --- a/julia/LibCEED.jl/test/runtests.jl +++ b/julia/LibCEED.jl/test/runtests.jl @@ -256,10 +256,6 @@ else LibCEED.assemble_add_diagonal!(op, diag_vector) @test @witharray(a = diag_vector, a == fill(1.0, n)) - comp_op = create_composite_operator(c, [op]) - apply!(comp_op, v1, v2) - @test @witharray_read(a1 = v1, @witharray_read(a2 = v2, a1 == a2)) - @test showstr(op) == """ CeedOperator 1 elements with 27 quadrature points each diff --git a/python/__init__.py b/python/__init__.py index 9c6560addb..c5eb31d18e 100644 --- a/python/__init__.py +++ b/python/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/build_ceed_cffi.py b/python/build_ceed_cffi.py index 6a77781d25..71c99a21f4 100644 --- a/python/build_ceed_cffi.py +++ b/python/build_ceed_cffi.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause @@ -13,6 +13,29 @@ ceed_version_ge = re.compile(r'\s+\(!?CEED_VERSION.*') +# Checks to see if a c line is part of the lines we have to exclude (macros) +def is_valid_line(line): + if (line.startswith("#") and not line.startswith("#include")): + return False + if (line.startswith("#include \"deprecated.h\"")): + return False + if (line.startswith(" CEED_QFUNCTION_ATTR")): + return False + if (line.startswith(" static const char")): + return False + if (line.endswith('\\\n')): + return False + if ("CeedErrorImpl" in line): + return False + if (r'const char *, ...);' in line): + return False + if (line.startswith("CEED_EXTERN const char *const")): + return False + if (ceed_version_ge.match(line)): + return False + return True + + def get_ceed_dirs(): here = os.path.dirname(os.path.abspath(__file__)) prefix = os.path.dirname(here) @@ -31,14 +54,7 @@ def get_ceed_dirs(): lines = [] for header_path in ["include/ceed/types.h", "include/ceed/ceed.h"]: with open(os.path.abspath(header_path)) as f: - lines += [line.strip() for line in f if - not (line.startswith("#") and not line.startswith("#include")) and - not line.startswith(" static") and - not line.startswith(" CEED_QFUNCTION_ATTR") and - "CeedErrorImpl" not in line and - "const char *, ...);" not in line and - not line.startswith("CEED_EXTERN const char *const") and - not ceed_version_ge.match(line)] + lines += [line.strip() for line in f if is_valid_line(line)] lines = [line.replace("CEED_EXTERN", "extern") for line in lines] # Find scalar type inclusion line and insert definitions diff --git a/python/ceed.py b/python/ceed.py index 092cd1d047..8df025acae 100644 --- a/python/ceed.py +++ b/python/ceed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/ceed_basis.py b/python/ceed_basis.py index e1c12def62..c4f71a7089 100644 --- a/python/ceed_basis.py +++ b/python/ceed_basis.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/ceed_constants.py b/python/ceed_constants.py index ab99f7b643..8b4ea22673 100644 --- a/python/ceed_constants.py +++ b/python/ceed_constants.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/ceed_elemrestriction.py b/python/ceed_elemrestriction.py index 9c986eb58e..42e72a9311 100644 --- a/python/ceed_elemrestriction.py +++ b/python/ceed_elemrestriction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/ceed_operator.py b/python/ceed_operator.py index 740beef641..90c3549f36 100644 --- a/python/ceed_operator.py +++ b/python/ceed_operator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause @@ -331,7 +331,7 @@ def __init__(self, ceed): # Reference to Ceed self._ceed = ceed # libCEED call - err_code = lib.CeedCompositeOperatorCreate( + err_code = lib.CeedOperatorCreateComposite( self._ceed._pointer[0], self._pointer) self._ceed._check_error(err_code) @@ -343,7 +343,7 @@ def add_sub(self, subop): subop: sub-operator Operator""" # libCEED call - err_code = lib.CeedCompositeOperatorAddSub( + err_code = lib.CeedOperatorCompositeAddSub( self._pointer[0], subop._pointer[0]) self._ceed._check_error(err_code) diff --git a/python/ceed_qfunction.py b/python/ceed_qfunction.py index 896d69bfd4..9c73581ae4 100644 --- a/python/ceed_qfunction.py +++ b/python/ceed_qfunction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/ceed_qfunctioncontext.py b/python/ceed_qfunctioncontext.py index 92c072bdd2..b98863aa7d 100644 --- a/python/ceed_qfunctioncontext.py +++ b/python/ceed_qfunctioncontext.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/ceed_vector.py b/python/ceed_vector.py index c72bb265ad..06bd693ec6 100644 --- a/python/ceed_vector.py +++ b/python/ceed_vector.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/Makefile b/python/tests/Makefile index 918d1551ef..94c49d5b3a 100644 --- a/python/tests/Makefile +++ b/python/tests/Makefile @@ -1,10 +1,12 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause # # This file is part of CEED: http://github.com/ceed +PYTHON ?= python3 + clean: rm -rf build __pycache__ .pytest_cache *.so diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 62e8f4bb1d..70bdf69cfc 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/libceed-qfunctions.c b/python/tests/libceed-qfunctions.c index bef055452a..14fdfa6749 100644 --- a/python/tests/libceed-qfunctions.c +++ b/python/tests/libceed-qfunctions.c @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/output/test_504.out b/python/tests/output/test_504.out index 3e1d828503..3b8acf130a 100644 --- a/python/tests/output/test_504.out +++ b/python/tests/output/test_504.out @@ -1,4 +1,4 @@ -CeedOperator +CeedOperator - setup_mass 15 elements with 8 quadrature points each 3 fields 2 input fields: @@ -20,7 +20,7 @@ CeedOperator No basis Active vector -CeedOperator +CeedOperator - apply_mass 15 elements with 8 quadrature points each 3 fields 2 input fields: diff --git a/python/tests/setup-qfunctions.py b/python/tests/setup-qfunctions.py index aab21d830a..74074b67c3 100644 --- a/python/tests/setup-qfunctions.py +++ b/python/tests/setup-qfunctions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause @@ -16,7 +16,7 @@ qf_module = Extension("libceed_qfunctions", include_dirs=[os.path.join(CEED_DIR, 'include')], sources=["libceed-qfunctions.c"], - extra_compile_args=["-O3", "-std=c99", + extra_compile_args=["-O3", "-std=c11", "-Wno-unused-variable", "-Wno-unused-function"]) diff --git a/python/tests/setup.cfg b/python/tests/setup.cfg index e0bbfb441c..7290d8e331 100644 --- a/python/tests/setup.cfg +++ b/python/tests/setup.cfg @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/test-0-ceed.py b/python/tests/test-0-ceed.py index b38d31a332..5ab30e1fd9 100644 --- a/python/tests/test-0-ceed.py +++ b/python/tests/test-0-ceed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause @@ -20,7 +20,7 @@ def test_000(ceed_resource): ceed = libceed.Ceed(ceed_resource) # ------------------------------------------------------------------------------- -# Test return of Ceed backend prefered memory type +# Test return of Ceed backend preferred memory type # ------------------------------------------------------------------------------- diff --git a/python/tests/test-1-vector.py b/python/tests/test-1-vector.py index 246b82515e..9838a35b30 100644 --- a/python/tests/test-1-vector.py +++ b/python/tests/test-1-vector.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause @@ -359,7 +359,7 @@ def test_126(ceed_resource, capsys): a = np.arange(10, 10 + n, dtype=ceed.scalar_type()) x.set_array(a, cmode=libceed.USE_POINTER) - a2 = np.arange(10, n, dtype=ceed.scalar_type()) + a2 = np.arange(0, n, dtype=ceed.scalar_type()) y.set_array(a2, cmode=libceed.USE_POINTER) y.copy_from(x) diff --git a/python/tests/test-2-elemrestriction.py b/python/tests/test-2-elemrestriction.py index 6f9b1a3c38..60feb73626 100644 --- a/python/tests/test-2-elemrestriction.py +++ b/python/tests/test-2-elemrestriction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/test-3-basis.py b/python/tests/test-3-basis.py index 453e0b8401..aaded78b21 100644 --- a/python/tests/test-3-basis.py +++ b/python/tests/test-3-basis.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/test-4-qfunction.py b/python/tests/test-4-qfunction.py index 42dd844e2f..0491a2c624 100644 --- a/python/tests/test-4-qfunction.py +++ b/python/tests/test-4-qfunction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
# # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/test-5-operator.py b/python/tests/test-5-operator.py index 7127fe395e..1b67bdab2d 100644 --- a/python/tests/test-5-operator.py +++ b/python/tests/test-5-operator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors # All Rights Reserved. See the top-level LICENSE and NOTICE files for details. # # SPDX-License-Identifier: BSD-2-Clause diff --git a/python/tests/test-qfunctions.h b/python/tests/test-qfunctions.h index eb9a5f3f1d..5790d540aa 100644 --- a/python/tests/test-qfunctions.h +++ b/python/tests/test-qfunctions.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/rust/libceed-sys/build.rs b/rust/libceed-sys/build.rs index d1cc93be6e..8c510fcc8c 100644 --- a/rust/libceed-sys/build.rs +++ b/rust/libceed-sys/build.rs @@ -14,6 +14,7 @@ fn main() { } else { // Install libceed.a or libceed.so to $OUT_DIR/lib let makeflags = env("CARGO_MAKEFLAGS").unwrap(); + let optflags = env("CARGO_CEED_OPT_FLAGS").unwrap_or_else(|| "".to_string()); let mut make = Command::new("make"); make.arg("install") .arg(format!("prefix={}", out_dir.to_string_lossy())) @@ -28,6 +29,9 @@ fn main() { .arg("FC=") // Don't try to find Fortran (unused library build/install) .env("MAKEFLAGS", makeflags) .current_dir("c-src"); + if optflags.len() > 0 { + make.env("OPT", optflags); + } if statik { make.arg("STATIC=1"); } diff --git a/rust/libceed-sys/src/lib.rs b/rust/libceed-sys/src/lib.rs index 1279fdddf5..21dff0d343 100644 --- a/rust/libceed-sys/src/lib.rs +++ b/rust/libceed-sys/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -17,5 +17,7 @@ pub mod bind_ceed { #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(dead_code)] + #![allow(clippy::too_long_first_doc_paragraph)] + #![allow(non_snake_case)] include!(concat!(env!("OUT_DIR"), "/bindings.rs")); } diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs index 4c11fb79b4..7018e0d462 100644 --- a/rust/libceed/src/basis.rs +++ b/rust/libceed/src/basis.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,7 @@ //! A Ceed Basis defines the discrete finite element basis and associated //! quadrature rule. -use crate::prelude::*; +use crate::{prelude::*, vector::Vector, EvalMode, TransposeMode}; // ----------------------------------------------------------------------------- // Basis option @@ -27,7 +27,7 @@ impl<'a> From<&'a Basis<'_>> for BasisOpt<'a> { } impl<'a> BasisOpt<'a> { /// Transform a Rust libCEED BasisOpt into C libCEED CeedBasis - pub(crate) fn to_raw(self) -> bind_ceed::CeedBasis { + pub(crate) fn to_raw(&self) -> bind_ceed::CeedBasis { match self { Self::Some(basis) => basis.ptr, Self::None => unsafe { bind_ceed::CEED_BASIS_NONE }, @@ -37,7 +37,7 @@ impl<'a> BasisOpt<'a> { /// Check if a BasisOpt is Some /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?; @@ -59,7 +59,7 @@ impl<'a> BasisOpt<'a> { /// Check if a BasisOpt is None /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?; @@ -108,7 +108,7 @@ impl<'a> fmt::Display for Basis<'a> { /// View a Basis /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?; @@ -134,6 +134,7 @@ impl<'a> fmt::Display for Basis<'a> { // ----------------------------------------------------------------------------- impl<'a> Basis<'a> { // Constructors + #[allow(clippy::too_many_arguments)] pub fn 
create_tensor_H1( ceed: &crate::Ceed, dim: usize, @@ -152,7 +153,7 @@ impl<'a> Basis<'a> { i32::try_from(P1d).unwrap(), i32::try_from(Q1d).unwrap(), ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedBasisCreateTensorH1( ceed.ptr, dim, @@ -165,8 +166,14 @@ impl<'a> Basis<'a> { qweight1d.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; + Ok(Self { + ptr, + _lifeline: PhantomData, + }) + } + + pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedBasis) -> crate::Result { Ok(Self { ptr, _lifeline: PhantomData, @@ -189,16 +196,16 @@ impl<'a> Basis<'a> { i32::try_from(Q).unwrap(), qmode as bind_ceed::CeedQuadMode, ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedBasisCreateTensorH1Lagrange(ceed.ptr, dim, ncomp, P, Q, qmode, &mut ptr) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, }) } + #[allow(clippy::too_many_arguments)] pub fn create_H1( ceed: &crate::Ceed, topo: crate::ElemTopology, @@ -217,7 +224,7 @@ impl<'a> Basis<'a> { i32::try_from(nnodes).unwrap(), i32::try_from(nqpts).unwrap(), ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedBasisCreateH1( ceed.ptr, topo, @@ -230,14 +237,14 @@ impl<'a> Basis<'a> { qweight.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, }) } + #[allow(clippy::too_many_arguments)] pub fn create_Hdiv( ceed: &crate::Ceed, topo: crate::ElemTopology, @@ -256,7 +263,7 @@ impl<'a> Basis<'a> { i32::try_from(nnodes).unwrap(), i32::try_from(nqpts).unwrap(), ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedBasisCreateHdiv( ceed.ptr, topo, @@ -269,14 +276,14 @@ impl<'a> Basis<'a> { qweight.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, }) } + #[allow(clippy::too_many_arguments)] pub fn create_Hcurl( ceed: &crate::Ceed, topo: crate::ElemTopology, @@ -295,7 +302,7 @@ impl<'a> Basis<'a> { i32::try_from(nnodes).unwrap(), 
i32::try_from(nqpts).unwrap(), ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedBasisCreateHcurl( ceed.ptr, topo, @@ -308,22 +315,23 @@ impl<'a> Basis<'a> { qweight.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, }) } + // Raw Ceed for error handling + #[doc(hidden)] + fn ceed(&self) -> bind_ceed::Ceed { + unsafe { bind_ceed::CeedBasisReturnCeed(self.ptr) } + } + // Error handling #[doc(hidden)] fn check_error(&self, ierr: i32) -> crate::Result { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedBasisGetCeed(self.ptr, &mut ptr); - } - crate::check_error(ptr, ierr) + crate::check_error(|| self.ceed(), ierr) } /// Apply basis evaluation from nodes to quadrature points or vice versa @@ -339,7 +347,7 @@ impl<'a> Basis<'a> { /// * `v` - Output Vector /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, TransposeMode, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// const Q: usize = 6; @@ -404,15 +412,15 @@ impl<'a> Basis<'a> { tmode as bind_ceed::CeedTransposeMode, emode as bind_ceed::CeedEvalMode, ); - let ierr = - unsafe { bind_ceed::CeedBasisApply(self.ptr, nelem, tmode, emode, u.ptr, v.ptr) }; - self.check_error(ierr) + self.check_error(unsafe { + bind_ceed::CeedBasisApply(self.ptr, nelem, tmode, emode, u.ptr, v.ptr) + }) } /// Returns the dimension for given Basis /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let dim = 2; @@ -432,7 +440,7 @@ impl<'a> Basis<'a> { /// Returns number of components for given Basis /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ncomp = 2; @@ -452,7 +460,7 @@ impl<'a> Basis<'a> { /// Returns 
total number of nodes (in dim dimensions) of a Basis /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let p = 3; @@ -473,7 +481,7 @@ impl<'a> Basis<'a> { /// Basis /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let q = 4; @@ -502,7 +510,7 @@ impl<'a> Basis<'a> { /// points and weights. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, TransposeMode, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let coarse = ceed.basis_tensor_H1_Lagrange(1, 1, 2, 3, QuadMode::Gauss)?; @@ -523,8 +531,9 @@ impl<'a> Basis<'a> { /// ``` pub fn create_projection(&self, to: &Self) -> crate::Result { let mut ptr = std::ptr::null_mut(); - let ierr = unsafe { bind_ceed::CeedBasisCreateProjection(self.ptr, to.ptr, &mut ptr) }; - self.check_error(ierr)?; + self.check_error(unsafe { + bind_ceed::CeedBasisCreateProjection(self.ptr, to.ptr, &mut ptr) + })?; Ok(Self { ptr, _lifeline: PhantomData, diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs index 950a840403..d251220ff7 100644 --- a/rust/libceed/src/elem_restriction.rs +++ b/rust/libceed/src/elem_restriction.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -8,7 +8,7 @@ //! A Ceed ElemRestriction decomposes elements and groups the degrees of freedom //! (dofs) according to the different elements they belong to. 
-use crate::prelude::*; +use crate::{prelude::*, vector::Vector, TransposeMode}; // ----------------------------------------------------------------------------- // ElemRestriction option @@ -28,7 +28,7 @@ impl<'a> From<&'a ElemRestriction<'_>> for ElemRestrictionOpt<'a> { impl<'a> ElemRestrictionOpt<'a> { /// Transform a Rust libCEED ElemRestrictionOpt into C libCEED /// CeedElemRestriction - pub(crate) fn to_raw(self) -> bind_ceed::CeedElemRestriction { + pub(crate) fn to_raw(&self) -> bind_ceed::CeedElemRestriction { match self { Self::Some(rstr) => rstr.ptr, Self::None => unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE }, @@ -38,7 +38,7 @@ impl<'a> ElemRestrictionOpt<'a> { /// Check if an ElemRestrictionOpt is Some /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, ElemRestrictionOpt, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -66,7 +66,7 @@ impl<'a> ElemRestrictionOpt<'a> { /// Check if an ElemRestrictionOpt is None /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, ElemRestrictionOpt, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -121,7 +121,7 @@ impl<'a> fmt::Display for ElemRestriction<'a> { /// View an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -153,6 +153,7 @@ impl<'a> fmt::Display for ElemRestriction<'a> { // ----------------------------------------------------------------------------- impl<'a> ElemRestriction<'a> { // Constructors + #[allow(clippy::too_many_arguments)] pub fn create( ceed: &crate::Ceed, nelem: usize, @@ -172,7 +173,7 @@ impl<'a> ElemRestriction<'a> { isize::try_from(lsize).unwrap(), mtype as bind_ceed::CeedMemType, ); - let ierr = unsafe { + 
ceed.check_error(unsafe { bind_ceed::CeedElemRestrictionCreate( ceed.ptr, nelem, @@ -185,14 +186,21 @@ impl<'a> ElemRestriction<'a> { offsets.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; + Ok(Self { + ptr, + _lifeline: PhantomData, + }) + } + + pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedElemRestriction) -> crate::Result { Ok(Self { ptr, _lifeline: PhantomData, }) } + #[allow(clippy::too_many_arguments)] pub fn create_oriented( ceed: &crate::Ceed, nelem: usize, @@ -213,7 +221,7 @@ impl<'a> ElemRestriction<'a> { isize::try_from(lsize).unwrap(), mtype as bind_ceed::CeedMemType, ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedElemRestrictionCreateOriented( ceed.ptr, nelem, @@ -227,14 +235,14 @@ impl<'a> ElemRestriction<'a> { orients.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, }) } + #[allow(clippy::too_many_arguments)] pub fn create_curl_oriented( ceed: &crate::Ceed, nelem: usize, @@ -255,7 +263,7 @@ impl<'a> ElemRestriction<'a> { isize::try_from(lsize).unwrap(), mtype as bind_ceed::CeedMemType, ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedElemRestrictionCreateCurlOriented( ceed.ptr, nelem, @@ -269,8 +277,7 @@ impl<'a> ElemRestriction<'a> { curlorients.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, @@ -292,7 +299,7 @@ impl<'a> ElemRestriction<'a> { i32::try_from(ncomp).unwrap(), isize::try_from(lsize).unwrap(), ); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedElemRestrictionCreateStrided( ceed.ptr, nelem, @@ -302,28 +309,29 @@ impl<'a> ElemRestriction<'a> { strides.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { ptr, _lifeline: PhantomData, }) } + // Raw Ceed for error handling + #[doc(hidden)] + fn ceed(&self) -> bind_ceed::Ceed { + unsafe { bind_ceed::CeedElemRestrictionReturnCeed(self.ptr) } + } + // Error handling #[doc(hidden)] fn check_error(&self, 
ierr: i32) -> crate::Result { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedElemRestrictionGetCeed(self.ptr, &mut ptr); - } - crate::check_error(ptr, ierr) + crate::check_error(|| self.ceed(), ierr) } /// Create an Lvector for an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -343,16 +351,16 @@ impl<'a> ElemRestriction<'a> { pub fn create_lvector<'b>(&self) -> crate::Result> { let mut ptr_lvector = std::ptr::null_mut(); let null = std::ptr::null_mut() as *mut _; - let ierr = - unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null) }; - self.check_error(ierr)?; - Vector::from_raw(ptr_lvector) + self.check_error(unsafe { + bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null) + })?; + unsafe { Vector::from_raw(ptr_lvector) } } /// Create an Evector for an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -372,16 +380,16 @@ impl<'a> ElemRestriction<'a> { pub fn create_evector<'b>(&self) -> crate::Result> { let mut ptr_evector = std::ptr::null_mut(); let null = std::ptr::null_mut() as *mut _; - let ierr = - unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector) }; - self.check_error(ierr)?; - Vector::from_raw(ptr_evector) + self.check_error(unsafe { + bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector) + })?; + unsafe { Vector::from_raw(ptr_evector) } } /// Create Vectors for an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -402,12 +410,11 @@ 
impl<'a> ElemRestriction<'a> { pub fn create_vectors<'b, 'c>(&self) -> crate::Result<(Vector<'b>, Vector<'c>)> { let mut ptr_lvector = std::ptr::null_mut(); let mut ptr_evector = std::ptr::null_mut(); - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, &mut ptr_evector) - }; - self.check_error(ierr)?; - let lvector = Vector::from_raw(ptr_lvector)?; - let evector = Vector::from_raw(ptr_evector)?; + })?; + let lvector = unsafe { Vector::from_raw(ptr_lvector)? }; + let evector = unsafe { Vector::from_raw(ptr_evector)? }; Ok((lvector, evector)) } @@ -422,7 +429,7 @@ impl<'a> ElemRestriction<'a> { /// decided by the backend. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType, Scalar, TransposeMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -451,7 +458,7 @@ impl<'a> ElemRestriction<'a> { /// ``` pub fn apply(&self, tmode: TransposeMode, u: &Vector, ru: &mut Vector) -> crate::Result { let tmode = tmode as bind_ceed::CeedTransposeMode; - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedElemRestrictionApply( self.ptr, tmode, @@ -459,14 +466,13 @@ impl<'a> ElemRestriction<'a> { ru.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } /// Returns the Lvector component stride /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -492,7 +498,7 @@ impl<'a> ElemRestriction<'a> { /// Returns the total number of elements in the range of a ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -517,7 +523,7 @@ impl<'a> ElemRestriction<'a> { /// Returns the size of 
elements in the ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -543,7 +549,7 @@ impl<'a> ElemRestriction<'a> { /// Returns the size of the Lvector for an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -568,7 +574,7 @@ impl<'a> ElemRestriction<'a> { /// Returns the number of components in the elements of an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -594,7 +600,7 @@ impl<'a> ElemRestriction<'a> { /// Returns the multiplicity of nodes in an ElemRestriction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -621,8 +627,9 @@ impl<'a> ElemRestriction<'a> { /// # } /// ``` pub fn multiplicity(&self, mult: &mut Vector) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedElemRestrictionGetMultiplicity(self.ptr, mult.ptr) }; - self.check_error(ierr) + self.check_error(unsafe { + bind_ceed::CeedElemRestrictionGetMultiplicity(self.ptr, mult.ptr) + }) } } diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs index bf7de98e07..ae6487ee83 100755 --- a/rust/libceed/src/lib.rs +++ b/rust/libceed/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -19,18 +19,6 @@ use crate::prelude::*; use std::sync::Once; pub mod prelude { - pub use crate::{ - basis::{self, Basis, BasisOpt}, - elem_restriction::{self, ElemRestriction, ElemRestrictionOpt}, - operator::{self, CompositeOperator, Operator, OperatorField}, - qfunction::{ - self, QFunction, QFunctionByName, QFunctionField, QFunctionInputs, QFunctionOpt, - QFunctionOutputs, - }, - vector::{self, Vector, VectorOpt, VectorSliceWrapper}, - ElemTopology, EvalMode, MemType, NormType, QuadMode, Scalar, TransposeMode, - CEED_STRIDES_BACKEND, EPSILON, MAX_QFUNCTION_FIELDS, - }; pub(crate) use libceed_sys::bind_ceed; pub(crate) use std::convert::TryFrom; pub(crate) use std::ffi::{CStr, CString}; @@ -157,11 +145,27 @@ impl fmt::Display for Error { } } +// ----------------------------------------------------------------------------- +// Internal crate contents +// ----------------------------------------------------------------------------- +pub use crate::{ + basis::{Basis, BasisOpt}, + elem_restriction::{ElemRestriction, ElemRestrictionOpt}, + operator::{CompositeOperator, Operator, OperatorField}, + qfunction::{ + QFunction, QFunctionByName, QFunctionField, QFunctionInputs, QFunctionOpt, QFunctionOutputs, + }, + vector::{Vector, VectorOpt, VectorSliceWrapper}, +}; + // ----------------------------------------------------------------------------- // Internal error checker // ----------------------------------------------------------------------------- #[doc(hidden)] -pub(crate) fn check_error(ceed_ptr: bind_ceed::Ceed, ierr: i32) -> Result { +pub(crate) fn check_error(ceed_ptr: F, ierr: i32) -> Result +where + F: FnOnce() -> bind_ceed::Ceed, +{ // Return early if code is clean if ierr == bind_ceed::CeedErrorType_CEED_ERROR_SUCCESS { return Ok(ierr); @@ -169,7 +173,7 @@ pub(crate) fn check_error(ceed_ptr: bind_ceed::Ceed, ierr: i32) -> Result { // Retrieve error message let mut ptr: *const std::os::raw::c_char = 
std::ptr::null_mut(); let c_str = unsafe { - bind_ceed::CeedGetErrorMessage(ceed_ptr, &mut ptr); + bind_ceed::CeedGetErrorMessage(ceed_ptr(), &mut ptr); std::ffi::CStr::from_ptr(ptr) }; let message = c_str.to_string_lossy().to_string(); @@ -225,8 +229,8 @@ impl Clone for Ceed { /// ``` fn clone(&self) -> Self { let mut ptr_clone = std::ptr::null_mut(); - let ierr = unsafe { bind_ceed::CeedReferenceCopy(self.ptr, &mut ptr_clone) }; - self.check_error(ierr).expect("failed to clone Ceed"); + self.check_error(unsafe { bind_ceed::CeedReferenceCopy(self.ptr, &mut ptr_clone) }) + .expect("failed to clone Ceed"); Self { ptr: ptr_clone } } } @@ -305,7 +309,7 @@ impl Ceed { // Call to libCEED let mut ptr = std::ptr::null_mut(); - let mut ierr = unsafe { bind_ceed::CeedInit(c_resource.as_ptr() as *const i8, &mut ptr) }; + let mut ierr = unsafe { bind_ceed::CeedInit(c_resource.as_ptr(), &mut ptr) }; if ierr != 0 { panic!("Error initializing backend resource: {}", resource) } @@ -424,7 +428,7 @@ impl Ceed { /// `[0, lsize - 1]`. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -437,6 +441,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + #[allow(clippy::too_many_arguments)] pub fn elem_restriction<'a>( &self, nelem: usize, @@ -483,7 +488,7 @@ impl Ceed { /// orientation. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -500,6 +505,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + #[allow(clippy::too_many_arguments)] pub fn oriented_elem_restriction<'a>( &self, nelem: usize, @@ -547,7 +553,7 @@ impl Ceed { /// unknowns upon restriction. 
/// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let nelem = 3; @@ -586,6 +592,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + #[allow(clippy::too_many_arguments)] pub fn curl_oriented_elem_restriction<'a>( &self, nelem: usize, @@ -687,6 +694,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + #[allow(clippy::too_many_arguments)] pub fn basis_tensor_H1<'a>( &self, dim: usize, @@ -716,7 +724,7 @@ impl Ceed { /// accuracy for the quadrature) /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QuadMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let b = ceed.basis_tensor_H1_Lagrange(2, 1, 3, 4, QuadMode::Gauss)?; @@ -752,7 +760,7 @@ impl Ceed { /// the reference element /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, ElemTopology}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let interp = [ @@ -849,6 +857,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + #[allow(clippy::too_many_arguments)] pub fn basis_H1<'a>( &self, topo: ElemTopology, @@ -883,7 +892,7 @@ impl Ceed { /// the reference element /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, ElemTopology}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let interp = [ @@ -947,6 +956,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + #[allow(clippy::too_many_arguments)] pub fn basis_Hdiv<'a>( &self, topo: ElemTopology, @@ -980,7 +990,7 @@ impl Ceed { /// the reference element /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, ElemTopology}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let interp = [ @@ -1041,6 +1051,7 @@ impl Ceed { /// # Ok(()) /// # } /// ``` + 
#[allow(clippy::too_many_arguments)] pub fn basis_Hcurl<'a>( &self, topo: ElemTopology, @@ -1074,7 +1085,7 @@ impl Ceed { /// * `f` - Boxed closure to evaluate weak form at quadrature points. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QFunctionInputs, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -1132,7 +1143,7 @@ impl Ceed { /// Jacobian of the qf (or qfunction_none) /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QFunctionOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs index 91e6b6f4a4..fae468d3c9 100644 --- a/rust/libceed/src/operator.rs +++ b/rust/libceed/src/operator.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -9,7 +9,13 @@ //! Ceed QFunction. A Ceed Operator connects Ceed ElemRestrictions, //! Ceed Bases, and Ceed QFunctions. 
-use crate::prelude::*; +use crate::{ + basis::{Basis, BasisOpt}, + elem_restriction::{ElemRestriction, ElemRestrictionOpt}, + prelude::*, + qfunction::QFunctionOpt, + vector::{Vector, VectorOpt}, +}; // ----------------------------------------------------------------------------- // Operator Field context wrapper @@ -17,6 +23,9 @@ use crate::prelude::*; #[derive(Debug)] pub struct OperatorField<'a> { pub(crate) ptr: bind_ceed::CeedOperatorField, + pub(crate) vector: crate::Vector<'a>, + pub(crate) elem_restriction: crate::ElemRestriction<'a>, + pub(crate) basis: crate::Basis<'a>, _lifeline: PhantomData<&'a ()>, } @@ -24,17 +33,48 @@ pub struct OperatorField<'a> { // Implementations // ----------------------------------------------------------------------------- impl<'a> OperatorField<'a> { + pub(crate) unsafe fn from_raw( + ptr: bind_ceed::CeedOperatorField, + ceed: crate::Ceed, + ) -> crate::Result { + let vector = { + let mut vector_ptr = std::ptr::null_mut(); + ceed.check_error(bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr))?; + crate::Vector::from_raw(vector_ptr)? + }; + let elem_restriction = { + let mut elem_restriction_ptr = std::ptr::null_mut(); + ceed.check_error(bind_ceed::CeedOperatorFieldGetElemRestriction( + ptr, + &mut elem_restriction_ptr, + ))?; + crate::ElemRestriction::from_raw(elem_restriction_ptr)? + }; + let basis = { + let mut basis_ptr = std::ptr::null_mut(); + ceed.check_error(bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr))?; + crate::Basis::from_raw(basis_ptr)? 
+ }; + Ok(Self { + ptr, + vector, + elem_restriction, + basis, + _lifeline: PhantomData, + }) + } + /// Get the name of an OperatorField /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -74,14 +114,14 @@ impl<'a> OperatorField<'a> { /// Get the ElemRestriction of an OperatorField /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -106,42 +146,54 @@ impl<'a> OperatorField<'a> { /// inputs[0].elem_restriction().is_some(), /// "Incorrect field ElemRestriction" /// ); + /// if let ElemRestrictionOpt::Some(r) = inputs[0].elem_restriction() { + /// assert_eq!( + /// r.num_elements(), + /// ne, + /// "Incorrect field ElemRestriction number of elements" + /// ); + /// } + /// /// assert!( /// inputs[1].elem_restriction().is_none(), /// "Incorrect field ElemRestriction" /// ); + /// + /// let outputs = op.outputs()?; + /// + /// assert!( + /// outputs[0].elem_restriction().is_some(), + /// "Incorrect field ElemRestriction" + /// ); + /// if let ElemRestrictionOpt::Some(r) = outputs[0].elem_restriction() { + /// assert_eq!( + /// r.num_elements(), + /// ne, + /// "Incorrect field 
ElemRestriction number of elements" + /// ); + /// } /// # Ok(()) /// # } /// ``` - pub fn elem_restriction(&self) -> ElemRestrictionOpt { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedOperatorFieldGetElemRestriction(self.ptr, &mut ptr); - } - if ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } { + pub fn elem_restriction(&self) -> ElemRestrictionOpt<'_> { + if self.elem_restriction.ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } { ElemRestrictionOpt::None } else { - let slice = unsafe { - std::slice::from_raw_parts( - &ptr as *const bind_ceed::CeedElemRestriction as *const crate::ElemRestriction, - 1 as usize, - ) - }; - ElemRestrictionOpt::Some(&slice[0]) + ElemRestrictionOpt::Some(&self.elem_restriction) } } /// Get the Basis of an OperatorField /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -163,7 +215,21 @@ impl<'a> OperatorField<'a> { /// let inputs = op.inputs()?; /// /// assert!(inputs[0].basis().is_some(), "Incorrect field Basis"); + /// if let BasisOpt::Some(b) = inputs[0].basis() { + /// assert_eq!( + /// b.num_quadrature_points(), + /// q, + /// "Incorrect field Basis number of quadrature points" + /// ); + /// } /// assert!(inputs[1].basis().is_some(), "Incorrect field Basis"); + /// if let BasisOpt::Some(b) = inputs[1].basis() { + /// assert_eq!( + /// b.num_quadrature_points(), + /// q, + /// "Incorrect field Basis number of quadrature points" + /// ); + /// } /// /// let outputs = op.outputs()?; /// @@ -171,35 +237,25 @@ impl<'a> OperatorField<'a> { /// # Ok(()) /// # } 
/// ``` - pub fn basis(&self) -> BasisOpt { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedOperatorFieldGetBasis(self.ptr, &mut ptr); - } - if ptr == unsafe { bind_ceed::CEED_BASIS_NONE } { + pub fn basis(&self) -> BasisOpt<'_> { + if self.basis.ptr == unsafe { bind_ceed::CEED_BASIS_NONE } { BasisOpt::None } else { - let slice = unsafe { - std::slice::from_raw_parts( - &ptr as *const bind_ceed::CeedBasis as *const crate::Basis, - 1 as usize, - ) - }; - BasisOpt::Some(&slice[0]) + BasisOpt::Some(&self.basis) } } /// Get the Vector of an OperatorField /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -222,26 +278,20 @@ impl<'a> OperatorField<'a> { /// /// assert!(inputs[0].vector().is_active(), "Incorrect field Vector"); /// assert!(inputs[1].vector().is_none(), "Incorrect field Vector"); + /// + /// let outputs = op.outputs()?; + /// + /// assert!(outputs[0].vector().is_active(), "Incorrect field Vector"); /// # Ok(()) /// # } /// ``` - pub fn vector(&self) -> VectorOpt { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedOperatorFieldGetVector(self.ptr, &mut ptr); - } - if ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } { + pub fn vector(&self) -> VectorOpt<'_> { + if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } { VectorOpt::Active - } else if ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } { + } else if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } { VectorOpt::None } else { - let slice = unsafe { - std::slice::from_raw_parts( - &ptr as *const 
bind_ceed::CeedVector as *const crate::Vector, - 1 as usize, - ) - }; - VectorOpt::Some(&slice[0]) + VectorOpt::Some(&self.vector) } } } @@ -296,14 +346,14 @@ impl<'a> fmt::Display for OperatorCore<'a> { /// View an Operator /// /// ``` -/// # use libceed::prelude::*; +/// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; -/// let q = 4 as usize; +/// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -336,13 +386,13 @@ impl<'a> fmt::Display for Operator<'a> { /// View a composite Operator /// /// ``` -/// # use libceed::prelude::*; +/// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// /// // Sub operator field arguments /// let ne = 3; -/// let q = 4 as usize; +/// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -393,100 +443,94 @@ impl<'a> fmt::Display for CompositeOperator<'a> { // Core functionality // ----------------------------------------------------------------------------- impl<'a> OperatorCore<'a> { + // Raw Ceed for error handling + #[doc(hidden)] + fn ceed(&self) -> bind_ceed::Ceed { + unsafe { bind_ceed::CeedOperatorReturnCeed(self.ptr) } + } + // Error handling #[doc(hidden)] fn check_error(&self, ierr: i32) -> crate::Result { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedOperatorGetCeed(self.ptr, &mut ptr); - } - crate::check_error(ptr, ierr) + crate::check_error(|| self.ceed(), ierr) } // Common implementations pub fn check(&self) -> crate::Result { - let ierr = unsafe { 
bind_ceed::CeedOperatorCheckReady(self.ptr) }; - self.check_error(ierr) + self.check_error(unsafe { bind_ceed::CeedOperatorCheckReady(self.ptr) }) } pub fn name(&self, name: &str) -> crate::Result { let name_c = CString::new(name).expect("CString::new failed"); - let ierr = unsafe { bind_ceed::CeedOperatorSetName(self.ptr, name_c.as_ptr()) }; - self.check_error(ierr) + self.check_error(unsafe { bind_ceed::CeedOperatorSetName(self.ptr, name_c.as_ptr()) }) } pub fn apply(&self, input: &Vector, output: &mut Vector) -> crate::Result { - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedOperatorApply( self.ptr, input.ptr, output.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } pub fn apply_add(&self, input: &Vector, output: &mut Vector) -> crate::Result { - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedOperatorApplyAdd( self.ptr, input.ptr, output.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } pub fn linear_assemble_diagonal(&self, assembled: &mut Vector) -> crate::Result { - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedOperatorLinearAssembleDiagonal( self.ptr, assembled.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } pub fn linear_assemble_add_diagonal(&self, assembled: &mut Vector) -> crate::Result { - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedOperatorLinearAssembleAddDiagonal( self.ptr, assembled.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } pub fn linear_assemble_point_block_diagonal( &self, assembled: &mut Vector, ) -> crate::Result { - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedOperatorLinearAssemblePointBlockDiagonal( self.ptr, assembled.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } pub fn linear_assemble_add_point_block_diagonal( &self, assembled: &mut Vector, ) -> crate::Result { - let ierr = unsafe { + 
self.check_error(unsafe { bind_ceed::CeedOperatorLinearAssembleAddPointBlockDiagonal( self.ptr, assembled.ptr, bind_ceed::CEED_REQUEST_IMMEDIATE, ) - }; - self.check_error(ierr) + }) } } @@ -502,7 +546,7 @@ impl<'a> Operator<'a> { dqfT: impl Into>, ) -> crate::Result { let mut ptr = std::ptr::null_mut(); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedOperatorCreate( ceed.ptr, qf.into().to_raw(), @@ -510,8 +554,7 @@ impl<'a> Operator<'a> { dqfT.into().to_raw(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { op_core: OperatorCore { ptr, @@ -520,7 +563,7 @@ impl<'a> Operator<'a> { }) } - fn from_raw(ptr: bind_ceed::CeedOperator) -> crate::Result { + unsafe fn from_raw(ptr: bind_ceed::CeedOperator) -> crate::Result { Ok(Self { op_core: OperatorCore { ptr, @@ -534,14 +577,14 @@ impl<'a> Operator<'a> { /// * 'name' - Name to set /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -575,7 +618,7 @@ impl<'a> Operator<'a> { /// * `output` - Output Vector /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -653,7 +696,7 @@ impl<'a> Operator<'a> { /// * `output` - Output Vector /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let 
ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -733,7 +776,7 @@ impl<'a> Operator<'a> { /// /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; @@ -765,8 +808,8 @@ impl<'a> Operator<'a> { v: impl Into>, ) -> crate::Result { let fieldname = CString::new(fieldname).expect("CString::new failed"); - let fieldname = fieldname.as_ptr() as *const i8; - let ierr = unsafe { + let fieldname = fieldname.as_ptr(); + self.op_core.check_error(unsafe { bind_ceed::CeedOperatorSetField( self.op_core.ptr, fieldname, @@ -774,22 +817,21 @@ impl<'a> Operator<'a> { b.into().to_raw(), v.into().to_raw(), ) - }; - self.op_core.check_error(ierr)?; + })?; Ok(self) } /// Get a slice of Operator inputs /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -814,41 +856,52 @@ impl<'a> Operator<'a> { /// # Ok(()) /// # } /// ``` - pub fn inputs(&self) -> crate::Result<&[crate::OperatorField]> { + pub fn inputs(&self) -> crate::Result>> { // Get array of raw C pointers for inputs let mut num_inputs = 0; let mut inputs_ptr = std::ptr::null_mut(); - let ierr = unsafe { + self.op_core.check_error(unsafe { bind_ceed::CeedOperatorGetFields( self.op_core.ptr, &mut num_inputs, &mut inputs_ptr, std::ptr::null_mut() as *mut bind_ceed::CeedInt, - std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField, + 
std::ptr::null_mut(), ) - }; - self.op_core.check_error(ierr)?; + })?; // Convert raw C pointers to fixed length slice let inputs_slice = unsafe { std::slice::from_raw_parts( - inputs_ptr as *const crate::OperatorField, + inputs_ptr as *mut bind_ceed::CeedOperatorField, num_inputs as usize, ) }; - Ok(inputs_slice) + // And finally build vec + let ceed = { + let ceed_raw = self.op_core.ceed(); + let mut ptr = std::ptr::null_mut(); + unsafe { + bind_ceed::CeedReferenceCopy(ceed_raw, &mut ptr); // refcount + } + crate::Ceed { ptr } + }; + let inputs = (0..num_inputs as usize) + .map(|i| unsafe { crate::OperatorField::from_raw(inputs_slice[i], ceed.clone()) }) + .collect::>>()?; + Ok(inputs) } /// Get a slice of Operator outputs /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; /// /// // Operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -873,34 +926,45 @@ impl<'a> Operator<'a> { /// # Ok(()) /// # } /// ``` - pub fn outputs(&self) -> crate::Result<&[crate::OperatorField]> { + pub fn outputs(&self) -> crate::Result>> { // Get array of raw C pointers for outputs let mut num_outputs = 0; let mut outputs_ptr = std::ptr::null_mut(); - let ierr = unsafe { + self.op_core.check_error(unsafe { bind_ceed::CeedOperatorGetFields( self.op_core.ptr, std::ptr::null_mut() as *mut bind_ceed::CeedInt, - std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField, + std::ptr::null_mut(), &mut num_outputs, &mut outputs_ptr, ) - }; - self.op_core.check_error(ierr)?; + })?; // Convert raw C pointers to fixed length slice let outputs_slice = unsafe { std::slice::from_raw_parts( - outputs_ptr as *const 
crate::OperatorField, + outputs_ptr as *mut bind_ceed::CeedOperatorField, num_outputs as usize, ) }; - Ok(outputs_slice) + // And finally build vec + let ceed = { + let ceed_raw = self.op_core.ceed(); + let mut ptr = std::ptr::null_mut(); + unsafe { + bind_ceed::CeedReferenceCopy(ceed_raw, &mut ptr); // refcount + } + crate::Ceed { ptr } + }; + let outputs = (0..num_outputs as usize) + .map(|i| unsafe { crate::OperatorField::from_raw(outputs_slice[i], ceed.clone()) }) + .collect::>>()?; + Ok(outputs) } /// Check if Operator is setup correctly /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -943,7 +1007,7 @@ impl<'a> Operator<'a> { /// /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; @@ -980,7 +1044,7 @@ impl<'a> Operator<'a> { /// /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?; @@ -1027,7 +1091,7 @@ impl<'a> Operator<'a> { /// * `assembled` - Vector to store assembled Operator diagonal /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -1134,7 +1198,7 @@ impl<'a> Operator<'a> { /// /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, 
QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -1246,7 +1310,7 @@ impl<'a> Operator<'a> { /// `[nodes, component out, component in]`. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionInputs, QFunctionOpt, QFunctionOutputs, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -1385,7 +1449,7 @@ impl<'a> Operator<'a> { /// `[nodes, component out, component in]`. /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionInputs, QFunctionOpt, QFunctionOutputs, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -1517,7 +1581,7 @@ impl<'a> Operator<'a> { /// * `basis_coarse` - Coarse grid active vector basis /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 15; @@ -1625,7 +1689,7 @@ impl<'a> Operator<'a> { /// // Check /// let sum: Scalar = v_fine.view()?.iter().sum(); /// assert!( - /// (sum - 2.0).abs() < 50.0 * libceed::EPSILON, + /// (sum - 2.0).abs() < 200.0 * libceed::EPSILON, /// "Incorrect interval length computed" /// ); /// @@ -1635,7 +1699,7 @@ impl<'a> Operator<'a> { /// // Check /// let sum: Scalar = v_coarse.view()?.iter().sum(); /// assert!( - /// (sum - 2.0).abs() < 50.0 * libceed::EPSILON, + /// (sum - 2.0).abs() < 200.0 * libceed::EPSILON, /// "Incorrect interval length computed" /// ); /// # Ok(()) @@ -1650,7 +1714,7 @@ impl<'a> Operator<'a> { let mut ptr_coarse = std::ptr::null_mut(); let mut ptr_prolong = 
std::ptr::null_mut(); let mut ptr_restrict = std::ptr::null_mut(); - let ierr = unsafe { + self.op_core.check_error(unsafe { bind_ceed::CeedOperatorMultigridLevelCreate( self.op_core.ptr, p_mult_fine.ptr, @@ -1660,11 +1724,10 @@ impl<'a> Operator<'a> { &mut ptr_prolong, &mut ptr_restrict, ) - }; - self.op_core.check_error(ierr)?; - let op_coarse = Operator::from_raw(ptr_coarse)?; - let op_prolong = Operator::from_raw(ptr_prolong)?; - let op_restrict = Operator::from_raw(ptr_restrict)?; + })?; + let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? }; + let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? }; + let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? }; Ok((op_coarse, op_prolong, op_restrict)) } @@ -1677,7 +1740,7 @@ impl<'a> Operator<'a> { /// * `interp_c_to_f` - Matrix for coarse to fine /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionOpt, QuadMode, Scalar, TransposeMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 15; @@ -1814,7 +1877,7 @@ impl<'a> Operator<'a> { /// // Check /// let sum: Scalar = v_fine.view()?.iter().sum(); /// assert!( - /// (sum - 2.0).abs() < 10.0 * libceed::EPSILON, + /// (sum - 2.0).abs() < 200.0 * libceed::EPSILON, /// "Incorrect interval length computed" /// ); /// @@ -1824,7 +1887,7 @@ impl<'a> Operator<'a> { /// // Check /// let sum: Scalar = v_coarse.view()?.iter().sum(); /// assert!( - /// (sum - 2.0).abs() < 10.0 * libceed::EPSILON, + /// (sum - 2.0).abs() < 200.0 * libceed::EPSILON, /// "Incorrect interval length computed" /// ); /// # Ok(()) @@ -1835,12 +1898,12 @@ impl<'a> Operator<'a> { p_mult_fine: &Vector, rstr_coarse: &ElemRestriction, basis_coarse: &Basis, - interpCtoF: &Vec, + interpCtoF: &[crate::Scalar], ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> { let mut ptr_coarse = std::ptr::null_mut(); let mut ptr_prolong = 
std::ptr::null_mut(); let mut ptr_restrict = std::ptr::null_mut(); - let ierr = unsafe { + self.op_core.check_error(unsafe { bind_ceed::CeedOperatorMultigridLevelCreateTensorH1( self.op_core.ptr, p_mult_fine.ptr, @@ -1851,11 +1914,10 @@ impl<'a> Operator<'a> { &mut ptr_prolong, &mut ptr_restrict, ) - }; - self.op_core.check_error(ierr)?; - let op_coarse = Operator::from_raw(ptr_coarse)?; - let op_prolong = Operator::from_raw(ptr_prolong)?; - let op_restrict = Operator::from_raw(ptr_restrict)?; + })?; + let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? }; + let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? }; + let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? }; Ok((op_coarse, op_prolong, op_restrict)) } @@ -1868,7 +1930,7 @@ impl<'a> Operator<'a> { /// * `interp_c_to_f` - Matrix for coarse to fine /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionOpt, QuadMode, Scalar, TransposeMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 15; @@ -2005,7 +2067,7 @@ impl<'a> Operator<'a> { /// // Check /// let sum: Scalar = v_fine.view()?.iter().sum(); /// assert!( - /// (sum - 2.0).abs() < 10.0 * libceed::EPSILON, + /// (sum - 2.0).abs() < 200.0 * libceed::EPSILON, /// "Incorrect interval length computed" /// ); /// @@ -2015,7 +2077,7 @@ impl<'a> Operator<'a> { /// // Check /// let sum: Scalar = v_coarse.view()?.iter().sum(); /// assert!( - /// (sum - 2.0).abs() < 10.0 * libceed::EPSILON, + /// (sum - 2.0).abs() < 200.0 * libceed::EPSILON, /// "Incorrect interval length computed" /// ); /// # Ok(()) @@ -2026,12 +2088,12 @@ impl<'a> Operator<'a> { p_mult_fine: &Vector, rstr_coarse: &ElemRestriction, basis_coarse: &Basis, - interpCtoF: &[Scalar], + interpCtoF: &[crate::Scalar], ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> { let mut ptr_coarse = std::ptr::null_mut(); let mut 
ptr_prolong = std::ptr::null_mut(); let mut ptr_restrict = std::ptr::null_mut(); - let ierr = unsafe { + self.op_core.check_error(unsafe { bind_ceed::CeedOperatorMultigridLevelCreateH1( self.op_core.ptr, p_mult_fine.ptr, @@ -2042,11 +2104,10 @@ impl<'a> Operator<'a> { &mut ptr_prolong, &mut ptr_restrict, ) - }; - self.op_core.check_error(ierr)?; - let op_coarse = Operator::from_raw(ptr_coarse)?; - let op_prolong = Operator::from_raw(ptr_prolong)?; - let op_restrict = Operator::from_raw(ptr_restrict)?; + })?; + let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? }; + let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? }; + let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? }; Ok((op_coarse, op_prolong, op_restrict)) } } @@ -2058,8 +2119,7 @@ impl<'a> CompositeOperator<'a> { // Constructor pub fn create(ceed: &crate::Ceed) -> crate::Result { let mut ptr = std::ptr::null_mut(); - let ierr = unsafe { bind_ceed::CeedCompositeOperatorCreate(ceed.ptr, &mut ptr) }; - ceed.check_error(ierr)?; + ceed.check_error(unsafe { bind_ceed::CeedOperatorCreateComposite(ceed.ptr, &mut ptr) })?; Ok(Self { op_core: OperatorCore { ptr, @@ -2073,13 +2133,13 @@ impl<'a> CompositeOperator<'a> { /// * 'name' - Name to set /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// /// // Sub operator field arguments /// let ne = 3; - /// let q = 4 as usize; + /// let q = 4_usize; /// let mut ind: Vec = vec![0; 2 * ne]; /// for i in 0..ne { /// ind[2 * i + 0] = i as i32; @@ -2130,7 +2190,7 @@ impl<'a> CompositeOperator<'a> { /// * `output` - Output Vector /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// 
let ne = 4; @@ -2227,7 +2287,7 @@ impl<'a> CompositeOperator<'a> { /// * `output` - Output Vector /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; @@ -2323,7 +2383,7 @@ impl<'a> CompositeOperator<'a> { /// * `subop` - Sub-Operator /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, QFunctionOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut op = ceed.composite_operator()?; @@ -2340,16 +2400,16 @@ impl<'a> CompositeOperator<'a> { /// ``` #[allow(unused_mut)] pub fn sub_operator(mut self, subop: &Operator) -> crate::Result { - let ierr = - unsafe { bind_ceed::CeedCompositeOperatorAddSub(self.op_core.ptr, subop.op_core.ptr) }; - self.op_core.check_error(ierr)?; + self.op_core.check_error(unsafe { + bind_ceed::CeedOperatorCompositeAddSub(self.op_core.ptr, subop.op_core.ptr) + })?; Ok(self) } /// Check if CompositeOperator is setup correctly /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let ne = 4; diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs index 0d32d01d28..f1eb5786f9 100644 --- a/rust/libceed/src/qfunction.rs +++ b/rust/libceed/src/qfunction.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause @@ -10,7 +10,7 @@ use std::pin::Pin; -use crate::prelude::*; +use crate::{prelude::*, vector::Vector, MAX_QFUNCTION_FIELDS}; pub type QFunctionInputs<'a> = [&'a [crate::Scalar]; MAX_QFUNCTION_FIELDS]; pub type QFunctionOutputs<'a> = [&'a mut [crate::Scalar]; MAX_QFUNCTION_FIELDS]; @@ -82,7 +82,7 @@ impl<'a> QFunctionField<'a> { /// Get the evaluation mode of a QFunctionField /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// const Q: usize = 8; @@ -108,7 +108,7 @@ impl<'a> QFunctionField<'a> { unsafe { bind_ceed::CeedQFunctionFieldGetEvalMode(self.ptr, &mut mode); } - crate::EvalMode::from_u32(mode as u32) + crate::EvalMode::from_u32(mode) } } @@ -139,7 +139,7 @@ impl<'a> From<&'a QFunctionByName<'_>> for QFunctionOpt<'a> { impl<'a> QFunctionOpt<'a> { /// Transform a Rust libCEED QFunctionOpt into C libCEED CeedQFunction - pub(crate) fn to_raw(self) -> bind_ceed::CeedQFunction { + pub(crate) fn to_raw(&self) -> bind_ceed::CeedQFunction { match self { Self::SomeQFunction(qfunc) => qfunc.qf_core.ptr, Self::SomeQFunctionByName(qfunc) => qfunc.qf_core.ptr, @@ -150,7 +150,7 @@ impl<'a> QFunctionOpt<'a> { /// Check if a QFunctionOpt is Some /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -191,7 +191,7 @@ impl<'a> QFunctionOpt<'a> { /// Check if a QFunctionOpt is SomeQFunction /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, 
..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -232,7 +232,7 @@ impl<'a> QFunctionOpt<'a> { /// Check if a QFunctionOpt is SomeQFunctionByName /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -282,7 +282,7 @@ impl<'a> QFunctionOpt<'a> { /// Check if a QFunctionOpt is None /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -389,7 +389,7 @@ impl<'a> fmt::Display for QFunctionCore<'a> { /// View a QFunction /// /// ``` -/// # use libceed::prelude::*; +/// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -439,14 +439,16 @@ impl<'a> fmt::Display for QFunctionByName<'a> { // Core functionality // ----------------------------------------------------------------------------- impl<'a> QFunctionCore<'a> { + // Raw Ceed for error handling + #[doc(hidden)] + fn ceed(&self) -> bind_ceed::Ceed { + unsafe { bind_ceed::CeedQFunctionReturnCeed(self.ptr) } + } + // Error handling #[doc(hidden)] fn check_error(&self, ierr: i32) -> crate::Result { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedQFunctionGetCeed(self.ptr, &mut ptr); - } - crate::check_error(ptr, ierr) + crate::check_error(|| self.ceed(), ierr) } // Common implementation @@ -460,56 +462,47 @@ impl<'a> QFunctionCore<'a> { v_c[i] = v[i].ptr; } let Q = 
i32::try_from(Q).unwrap(); - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedQFunctionApply(self.ptr, Q, u_c.as_mut_ptr(), v_c.as_mut_ptr()) - }; - self.check_error(ierr) + }) } - pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> { + pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> { // Get array of raw C pointers for inputs let mut num_inputs = 0; let mut inputs_ptr = std::ptr::null_mut(); - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedQFunctionGetFields( self.ptr, &mut num_inputs, &mut inputs_ptr, std::ptr::null_mut() as *mut bind_ceed::CeedInt, - std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField, + std::ptr::null_mut(), ) - }; - self.check_error(ierr)?; + })?; // Convert raw C pointers to fixed length slice let inputs_slice = unsafe { - std::slice::from_raw_parts( - inputs_ptr as *const crate::QFunctionField, - num_inputs as usize, - ) + std::slice::from_raw_parts(inputs_ptr as *const QFunctionField, num_inputs as usize) }; Ok(inputs_slice) } - pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> { + pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> { // Get array of raw C pointers for outputs let mut num_outputs = 0; let mut outputs_ptr = std::ptr::null_mut(); - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedQFunctionGetFields( self.ptr, std::ptr::null_mut() as *mut bind_ceed::CeedInt, - std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField, + std::ptr::null_mut(), &mut num_outputs, &mut outputs_ptr, ) - }; - self.check_error(ierr)?; + })?; // Convert raw C pointers to fixed length slice let outputs_slice = unsafe { - std::slice::from_raw_parts( - outputs_ptr as *const crate::QFunctionField, - num_outputs as usize, - ) + std::slice::from_raw_parts(outputs_ptr as *const QFunctionField, num_outputs as usize) }; Ok(outputs_slice) } @@ -573,12 +566,6 @@ unsafe extern "C" fn trampoline( (trampoline_data.get_unchecked_mut().user_f)(inputs_array, 
outputs_array) } -unsafe extern "C" fn destroy_trampoline(ctx: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int { - let trampoline_data: Pin<&mut QFunctionTrampolineData> = std::mem::transmute(ctx); - drop(trampoline_data); - 0 // Clean error code -} - // ----------------------------------------------------------------------------- // QFunction // ----------------------------------------------------------------------------- @@ -609,7 +596,7 @@ impl<'a> QFunction<'a> { // Create QFunction let vlength = i32::try_from(vlength).unwrap(); - let mut ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedQFunctionCreateInterior( ceed.ptr, vlength, @@ -617,33 +604,27 @@ impl<'a> QFunction<'a> { source_c.as_ptr(), &mut ptr, ) - }; - ceed.check_error(ierr)?; + })?; // Set closure let mut qf_ctx_ptr = std::ptr::null_mut(); - ierr = unsafe { bind_ceed::CeedQFunctionContextCreate(ceed.ptr, &mut qf_ctx_ptr) }; - ceed.check_error(ierr)?; - ierr = unsafe { + ceed.check_error(unsafe { + bind_ceed::CeedQFunctionContextCreate(ceed.ptr, &mut qf_ctx_ptr) + })?; + ceed.check_error(unsafe { bind_ceed::CeedQFunctionContextSetData( qf_ctx_ptr, crate::MemType::Host as bind_ceed::CeedMemType, crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode, std::mem::size_of::(), - std::mem::transmute(trampoline_data.as_ref()), + std::mem::transmute::< + std::pin::Pin<&QFunctionTrampolineData>, + *mut std::ffi::c_void, + >(trampoline_data.as_ref()), ) - }; - ceed.check_error(ierr)?; - ierr = unsafe { - bind_ceed::CeedQFunctionContextSetDataDestroy( - qf_ctx_ptr, - crate::MemType::Host as bind_ceed::CeedMemType, - Some(destroy_trampoline), - ) - }; - ceed.check_error(ierr)?; - ierr = unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) }; - ceed.check_error(ierr)?; + })?; + ceed.check_error(unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) })?; + ceed.check_error(unsafe { bind_ceed::CeedQFunctionContextDestroy(&mut qf_ctx_ptr) })?; Ok(Self { qf_core: QFunctionCore { ptr, @@ 
-661,7 +642,7 @@ impl<'a> QFunction<'a> { /// * `output` - Array of output Vectors /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -729,7 +710,7 @@ impl<'a> QFunction<'a> { /// gradients, `EvalMode::Weight` to use quadrature weights /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -763,10 +744,9 @@ impl<'a> QFunction<'a> { i32::try_from(size).unwrap(), emode as bind_ceed::CeedEvalMode, ); - let ierr = unsafe { + self.qf_core.check_error(unsafe { bind_ceed::CeedQFunctionAddInput(self.qf_core.ptr, name_c.as_ptr(), size, emode) - }; - self.qf_core.check_error(ierr)?; + })?; Ok(self) } @@ -780,7 +760,7 @@ impl<'a> QFunction<'a> { /// gradients /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -813,17 +793,16 @@ impl<'a> QFunction<'a> { i32::try_from(size).unwrap(), emode as bind_ceed::CeedEvalMode, ); - let ierr = unsafe { + self.qf_core.check_error(unsafe { bind_ceed::CeedQFunctionAddOutput(self.qf_core.ptr, name_c.as_ptr(), size, emode) - }; - self.qf_core.check_error(ierr)?; + })?; Ok(self) } /// Get a slice of QFunction inputs /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = 
libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -847,14 +826,14 @@ impl<'a> QFunction<'a> { /// # Ok(()) /// # } /// ``` - pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> { + pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> { self.qf_core.inputs() } /// Get a slice of QFunction outputs /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| { @@ -877,7 +856,7 @@ impl<'a> QFunction<'a> { /// # Ok(()) /// # } /// ``` - pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> { + pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> { self.qf_core.outputs() } } @@ -890,10 +869,9 @@ impl<'a> QFunctionByName<'a> { pub fn create(ceed: &crate::Ceed, name: &str) -> crate::Result { let name_c = CString::new(name).expect("CString::new failed"); let mut ptr = std::ptr::null_mut(); - let ierr = unsafe { + ceed.check_error(unsafe { bind_ceed::CeedQFunctionCreateInteriorByName(ceed.ptr, name_c.as_ptr(), &mut ptr) - }; - ceed.check_error(ierr)?; + })?; Ok(Self { qf_core: QFunctionCore { ptr, @@ -909,7 +887,7 @@ impl<'a> QFunctionByName<'a> { /// * `output` - Array of output Vectors /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// const Q: usize = 8; @@ -982,7 +960,7 @@ impl<'a> QFunctionByName<'a> { /// # Ok(()) /// # } /// ``` - pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> { + pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> { self.qf_core.inputs() } @@ -1001,7 +979,7 @@ impl<'a> QFunctionByName<'a> { /// # Ok(()) /// # } 
/// ``` - pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> { + pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> { self.qf_core.outputs() } } diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs index c90d8a295a..a1f9cd5178 100644 --- a/rust/libceed/src/vector.rs +++ b/rust/libceed/src/vector.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -34,7 +34,7 @@ impl<'a> From<&'a Vector<'_>> for VectorOpt<'a> { } impl<'a> VectorOpt<'a> { /// Transform a Rust libCEED VectorOpt into C libCEED CeedVector - pub(crate) fn to_raw(self) -> bind_ceed::CeedVector { + pub(crate) fn to_raw(&self) -> bind_ceed::CeedVector { match self { Self::Some(vec) => vec.ptr, Self::Active => unsafe { bind_ceed::CEED_VECTOR_ACTIVE }, @@ -45,7 +45,7 @@ impl<'a> VectorOpt<'a> { /// Check if a VectorOpt is Some /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?; @@ -71,7 +71,7 @@ impl<'a> VectorOpt<'a> { /// Check if a VectorOpt is Active /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?; @@ -97,7 +97,7 @@ impl<'a> VectorOpt<'a> { /// Check if a VectorOpt is Some /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, VectorOpt}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let vec = 
libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?; @@ -125,7 +125,7 @@ impl<'a> VectorOpt<'a> { // Vector borrowed slice wrapper // ----------------------------------------------------------------------------- pub struct VectorSliceWrapper<'a> { - pub(crate) vector: crate::Vector<'a>, + pub(crate) vector: Vector<'a>, pub(crate) _slice: &'a mut [crate::Scalar], } @@ -149,7 +149,7 @@ impl<'a> Drop for VectorSliceWrapper<'a> { // ----------------------------------------------------------------------------- impl<'a> VectorSliceWrapper<'a> { fn from_vector_and_slice_mut<'b>( - vec: &'b mut crate::Vector, + vec: &'b mut Vector, slice: &'a mut [crate::Scalar], ) -> crate::Result { assert_eq!(vec.length(), slice.len()); @@ -157,18 +157,16 @@ impl<'a> VectorSliceWrapper<'a> { crate::MemType::Host as bind_ceed::CeedMemType, crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode, ); - let ierr = unsafe { + vec.check_error(unsafe { bind_ceed::CeedVectorSetArray( vec.ptr, host, copy_mode, slice.as_ptr() as *mut crate::Scalar, ) - }; - vec.check_error(ierr)?; - + })?; Ok(Self { - vector: crate::Vector::from_raw(vec.ptr_copy_mut()?)?, + vector: unsafe { Vector::from_raw(vec.ptr_copy_mut()?)? 
}, _slice: slice, }) } @@ -247,15 +245,14 @@ impl<'a> Vector<'a> { pub fn create(ceed: &crate::Ceed, n: usize) -> crate::Result { let n = isize::try_from(n).unwrap(); let mut ptr = std::ptr::null_mut(); - let ierr = unsafe { bind_ceed::CeedVectorCreate(ceed.ptr, n, &mut ptr) }; - ceed.check_error(ierr)?; + ceed.check_error(unsafe { bind_ceed::CeedVectorCreate(ceed.ptr, n, &mut ptr) })?; Ok(Self { ptr, _lifeline: PhantomData, }) } - pub(crate) fn from_raw(ptr: bind_ceed::CeedVector) -> crate::Result { + pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedVector) -> crate::Result { Ok(Self { ptr, _lifeline: PhantomData, @@ -264,8 +261,7 @@ impl<'a> Vector<'a> { fn ptr_copy_mut(&mut self) -> crate::Result { let mut ptr_copy = std::ptr::null_mut(); - let ierr = unsafe { bind_ceed::CeedVectorReferenceCopy(self.ptr, &mut ptr_copy) }; - self.check_error(ierr)?; + self.check_error(unsafe { bind_ceed::CeedVectorReferenceCopy(self.ptr, &mut ptr_copy) })?; Ok(ptr_copy) } @@ -276,7 +272,7 @@ impl<'a> Vector<'a> { /// * `vec_source` - vector to copy array values from /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let a = ceed.vector_from_slice(&[1., 2., 3.])?; @@ -290,9 +286,8 @@ impl<'a> Vector<'a> { /// # } /// ``` /// ``` - pub fn copy_from(&mut self, vec_source: &crate::Vector) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) }; - self.check_error(ierr) + pub fn copy_from(&mut self, vec_source: &Vector) -> crate::Result { + self.check_error(unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) }) } /// Create a Vector from a slice @@ -305,7 +300,7 @@ impl<'a> Vector<'a> { /// # use libceed::prelude::*; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); - /// let vec = vector::Vector::from_slice(&ceed, &[1., 2., 3.])?; + /// let vec = 
libceed::Vector::from_slice(&ceed, &[1., 2., 3.])?; /// assert_eq!(vec.length(), 3, "Incorrect length from slice"); /// # Ok(()) /// # } @@ -340,19 +335,20 @@ impl<'a> Vector<'a> { crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode, ); let v = v.as_ptr() as *mut crate::Scalar; - let ierr = unsafe { bind_ceed::CeedVectorSetArray(x.ptr, host, user_pointer, v) }; - ceed.check_error(ierr)?; + ceed.check_error(unsafe { bind_ceed::CeedVectorSetArray(x.ptr, host, user_pointer, v) })?; Ok(x) } + // Raw Ceed for error handling + #[doc(hidden)] + fn ceed(&self) -> bind_ceed::Ceed { + unsafe { bind_ceed::CeedVectorReturnCeed(self.ptr) } + } + // Error handling #[doc(hidden)] fn check_error(&self, ierr: i32) -> crate::Result { - let mut ptr = std::ptr::null_mut(); - unsafe { - bind_ceed::CeedVectorGetCeed(self.ptr, &mut ptr); - } - crate::check_error(ptr, ierr) + crate::check_error(|| self.ceed(), ierr) } /// Returns the length of a Vector @@ -389,6 +385,23 @@ impl<'a> Vector<'a> { self.length() } + /// Returns true if the Vector contains no elements + /// + /// ``` + /// # use libceed::prelude::*; + /// # fn main() -> libceed::Result<()> { + /// # let ceed = libceed::Ceed::default_init(); + /// let vec = ceed.vector(10)?; + /// assert!(!vec.is_empty(), "Incorrect emptiness"); + /// let empty_vec = ceed.vector(0)?; + /// assert!(empty_vec.is_empty(), "Incorrect emptiness"); + /// # Ok(()) + /// # } + /// ``` + pub fn is_empty(&self) -> bool { + self.length() == 0 + } + /// Set the Vector to a constant value /// /// # arguments @@ -412,8 +425,7 @@ impl<'a> Vector<'a> { /// # } /// ``` pub fn set_value(&mut self, value: crate::Scalar) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorSetValue(self.ptr, value) }; - self.check_error(ierr) + self.check_error(unsafe { bind_ceed::CeedVectorSetValue(self.ptr, value) }) } /// Set values from a slice of the same length @@ -423,7 +435,7 @@ impl<'a> Vector<'a> { /// * `slice` - values to into self; length must match /// 
/// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut vec = ceed.vector(4)?; @@ -441,15 +453,14 @@ impl<'a> Vector<'a> { crate::MemType::Host as bind_ceed::CeedMemType, crate::CopyMode::CopyValues as bind_ceed::CeedCopyMode, ); - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedVectorSetArray( self.ptr, host, copy_mode, slice.as_ptr() as *mut crate::Scalar, ) - }; - self.check_error(ierr) + }) } /// Wrap a mutable slice in a Vector of the same length @@ -459,7 +470,7 @@ impl<'a> Vector<'a> { /// * `slice` - values to wrap in self; length must match /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut vec = ceed.vector(4)?; @@ -501,7 +512,7 @@ impl<'a> Vector<'a> { &mut self, slice: &'b mut [crate::Scalar], ) -> crate::Result> { - crate::VectorSliceWrapper::from_vector_and_slice_mut(self, slice) + VectorSliceWrapper::from_vector_and_slice_mut(self, slice) } /// Sync the Vector to a specified memtype @@ -511,7 +522,7 @@ impl<'a> Vector<'a> { /// * `mtype` - Memtype to be synced /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, MemType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let len = 10; @@ -528,9 +539,9 @@ impl<'a> Vector<'a> { /// # } /// ``` pub fn sync(&self, mtype: crate::MemType) -> crate::Result { - let ierr = - unsafe { bind_ceed::CeedVectorSyncArray(self.ptr, mtype as bind_ceed::CeedMemType) }; - self.check_error(ierr) + self.check_error(unsafe { + bind_ceed::CeedVectorSyncArray(self.ptr, mtype as bind_ceed::CeedMemType) + }) } /// Create an immutable view @@ -550,7 +561,7 @@ impl<'a> Vector<'a> { /// # Ok(()) /// # } /// ``` - pub fn view(&self) -> crate::Result { + pub fn view(&self) -> 
crate::Result> { VectorView::new(self) } @@ -572,7 +583,7 @@ impl<'a> Vector<'a> { /// # Ok(()) /// # } /// ``` - pub fn view_mut(&mut self) -> crate::Result { + pub fn view_mut(&mut self) -> crate::Result> { VectorViewMut::new(self) } @@ -583,7 +594,7 @@ impl<'a> Vector<'a> { /// * `ntype` - Norm type One, Two, or Max /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, NormType}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let vec = ceed.vector_from_slice(&[1., 2., 3., 4.])?; @@ -601,10 +612,9 @@ impl<'a> Vector<'a> { /// ``` pub fn norm(&self, ntype: crate::NormType) -> crate::Result { let mut res: crate::Scalar = 0.0; - let ierr = unsafe { + self.check_error(unsafe { bind_ceed::CeedVectorNorm(self.ptr, ntype as bind_ceed::CeedNormType, &mut res) - }; - self.check_error(ierr)?; + })?; Ok(res) } @@ -615,7 +625,7 @@ impl<'a> Vector<'a> { /// * `alpha` - scaling factor /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut vec = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?; @@ -629,8 +639,7 @@ impl<'a> Vector<'a> { /// ``` #[allow(unused_mut)] pub fn scale(mut self, alpha: crate::Scalar) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorScale(self.ptr, alpha) }; - self.check_error(ierr)?; + self.check_error(unsafe { bind_ceed::CeedVectorScale(self.ptr, alpha) })?; Ok(self) } @@ -642,7 +651,7 @@ impl<'a> Vector<'a> { /// * `x` - second vector, must be different than self /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let x = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?; @@ -656,9 +665,8 @@ impl<'a> Vector<'a> { /// # } /// ``` #[allow(unused_mut)] - pub fn axpy(mut self, alpha: crate::Scalar, x: &crate::Vector) -> 
crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) }; - self.check_error(ierr)?; + pub fn axpy(mut self, alpha: crate::Scalar, x: &Vector) -> crate::Result { + self.check_error(unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) })?; Ok(self) } @@ -671,7 +679,7 @@ impl<'a> Vector<'a> { /// * `x` - second vector, must be different than self /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let x = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?; @@ -689,10 +697,9 @@ impl<'a> Vector<'a> { mut self, alpha: crate::Scalar, beta: crate::Scalar, - x: &crate::Vector, + x: &Vector, ) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) }; - self.check_error(ierr)?; + self.check_error(unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) })?; Ok(self) } @@ -704,7 +711,7 @@ impl<'a> Vector<'a> { /// * `y` - second vector for product /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?; @@ -719,9 +726,8 @@ impl<'a> Vector<'a> { /// # } /// ``` #[allow(unused_mut)] - pub fn pointwise_mult(mut self, x: &crate::Vector, y: &crate::Vector) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) }; - self.check_error(ierr)?; + pub fn pointwise_mult(mut self, x: &Vector, y: &Vector) -> crate::Result { + self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) })?; Ok(self) } @@ -732,7 +738,7 @@ impl<'a> Vector<'a> { /// * `x` - second vector for product /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = 
libceed::Ceed::default_init(); /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?; @@ -746,16 +752,15 @@ impl<'a> Vector<'a> { /// # } /// ``` #[allow(unused_mut)] - pub fn pointwise_scale(mut self, x: &crate::Vector) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) }; - self.check_error(ierr)?; + pub fn pointwise_scale(mut self, x: &Vector) -> crate::Result { + self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) })?; Ok(self) } /// Compute the pointwise multiplication w = w .* w for a Vector /// /// ``` - /// # use libceed::prelude::*; + /// # use libceed::{prelude::*, Scalar}; /// # fn main() -> libceed::Result<()> { /// # let ceed = libceed::Ceed::default_init(); /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?; @@ -769,8 +774,9 @@ impl<'a> Vector<'a> { /// ``` #[allow(unused_mut)] pub fn pointwise_square(mut self) -> crate::Result { - let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, self.ptr) }; - self.check_error(ierr)?; + self.check_error(unsafe { + bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, self.ptr) + })?; Ok(self) } } @@ -791,18 +797,14 @@ impl<'a> VectorView<'a> { /// Construct a VectorView from a Vector reference fn new(vec: &'a Vector) -> crate::Result { let mut array = std::ptr::null(); - let ierr = unsafe { + vec.check_error(unsafe { bind_ceed::CeedVectorGetArrayRead( vec.ptr, crate::MemType::Host as bind_ceed::CeedMemType, &mut array, ) - }; - vec.check_error(ierr)?; - Ok(Self { - vec: vec, - array: array, - }) + })?; + Ok(Self { vec, array }) } } @@ -843,19 +845,15 @@ pub struct VectorViewMut<'a> { impl<'a> VectorViewMut<'a> { /// Construct a VectorViewMut from a Vector reference fn new(vec: &'a mut Vector) -> crate::Result { - let mut ptr = std::ptr::null_mut(); - let ierr = unsafe { + let mut array = std::ptr::null_mut(); + vec.check_error(unsafe { bind_ceed::CeedVectorGetArray( vec.ptr, 
crate::MemType::Host as bind_ceed::CeedMemType, - &mut ptr, + &mut array, ) - }; - vec.check_error(ierr)?; - Ok(Self { - vec: vec, - array: ptr, - }) + })?; + Ok(Self { vec, array }) } } diff --git a/rust/libceed/tests/version-numbers.rs b/rust/libceed/tests/version-numbers.rs index 5f276eae0b..c0f189e2e1 100644 --- a/rust/libceed/tests/version-numbers.rs +++ b/rust/libceed/tests/version-numbers.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors // All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details. // // SPDX-License-Identifier: (BSD-2-Clause) diff --git a/setup.py b/setup.py index a2bb813e59..b0d423c815 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,6 @@ def make_libceed_so(self, prefix): Development Status :: 4 - Beta Intended Audience :: Developers Intended Audience :: Science/Research -License :: OSI Approved :: BSD License Operating System :: POSIX Programming Language :: C Programming Language :: C++ diff --git a/tests/README.md b/tests/README.md index fd6e426420..031ff5a030 100644 --- a/tests/README.md +++ b/tests/README.md @@ -15,7 +15,8 @@ The tests are organized by API object, and some tests are further organized, as     2. CeedBasis simplex basis tests\     3. CeedBasis non-tensor H(div) basis tests\     4. CeedBasis non-tensor H(curl) basis tests\ -    5. CeedBasis evaluation at arbitrary points tests +    5. CeedBasis evaluation at arbitrary points tests\ + 6. CeedBasis ApplyAdd tests 4. CeedQFunction Tests\     0. CeedQFunction user code tests\     1. 
CeedQFunction gallery code tests diff --git a/tests/junit.py b/tests/junit.py index 041582c172..6ea4bcb0b6 100755 --- a/tests/junit.py +++ b/tests/junit.py @@ -24,10 +24,12 @@ def create_argparser() -> argparse.ArgumentParser: help='Output mode, junit or tap', default=RunMode.JUNIT) parser.add_argument('-n', '--nproc', type=int, default=1, help='number of MPI processes') - parser.add_argument('-o', '--output', type=Optional[Path], default=None, help='Output file to write test') parser.add_argument('-b', '--junit-batch', type=str, default='', help='Name of JUnit batch for output file') parser.add_argument('-np', '--pool-size', type=int, default=1, help='Number of test cases to run in parallel') - parser.add_argument('-s', '--smartredis_dir', type=str, default='', help='path to SmartSim library, if present') + parser.add_argument('-s', '--search', type=str, default='.*', + help='Search string to filter tests, using `re` package format') + parser.add_argument('-v', '--verbose', action='store_true', default=False, + help='print details for all runs, not just failures') parser.add_argument('test', help='Test executable', nargs='?') return parser @@ -35,6 +37,9 @@ def create_argparser() -> argparse.ArgumentParser: # Necessary functions for running tests class CeedSuiteSpec(SuiteSpec): + def __init__(self): + pass + def get_source_path(self, test: str) -> Path: """Compute path to test source file @@ -45,6 +50,8 @@ def get_source_path(self, test: str) -> Path: Path: Path to source file """ prefix, rest = test.split('-', 1) + if prefix == 'rustqfunctions': + return (Path('examples') / 'rust-qfunctions' / rest).with_suffix('.c') if prefix == 'petsc': return (Path('examples') / 'petsc' / rest).with_suffix('.c') elif prefix == 'mfem': @@ -58,7 +65,10 @@ def get_source_path(self, test: str) -> Path: elif prefix == 'solids': return (Path('examples') / 'solids' / rest).with_suffix('.c') elif test.startswith('ex'): - return (Path('examples') / 'ceed' / test).with_suffix('.c') + if 
test.endswith('-f'): + return (Path('examples') / 'ceed' / test).with_suffix('.f90') + else: + return (Path('examples') / 'ceed' / test).with_suffix('.c') elif test.endswith('-f'): return (Path('tests') / test).with_suffix('.f90') else: @@ -100,9 +110,6 @@ def check_pre_skip(self, test: str, spec: TestSpec, resource: str, nproc: int) - Returns: Optional[str]: Skip reason, or `None` if test case should not be skipped """ - if contains_any(resource, ['occa']) and startswith_any( - test, ['t4', 't5', 'ex', 'mfem', 'nek', 'petsc', 'fluids', 'solids']): - return 'OCCA mode not supported' if test.startswith('t318') and contains_any(resource, ['/gpu/cuda/ref']): return 'CUDA ref backend not supported' if test.startswith('t506') and contains_any(resource, ['/gpu/cuda/shared']): @@ -123,9 +130,7 @@ def check_post_skip(self, test: str, spec: TestSpec, resource: str, stderr: str) Returns: Optional[str]: Skip reason, or `None` if unexpeced error """ - if 'OCCA backend failed to use' in stderr: - return f'OCCA mode not supported' - elif 'Backend does not implement' in stderr: + if 'Backend does not implement' in stderr: return f'Backend does not implement' elif 'Can only provide HOST memory for this backend' in stderr: return f'Device memory not supported' @@ -170,7 +175,7 @@ def check_required_failure(self, test: str, spec: TestSpec, resource: str, stder elif test_id in ['t215']: fail_str = 'Cannot destroy CeedElemRestriction, a process has read access to the offset data' elif test_id in ['t303']: - fail_str = 'Length of input/output vectors incompatible with basis dimensions' + fail_str = 'Input/output vectors too short for basis and evaluation mode' elif test_id in ['t408']: fail_str = 'CeedQFunctionContextGetData(): Cannot grant CeedQFunctionContext data access, a process has read access' elif test_id in ['t409'] and contains_any(resource, ['memcheck']): @@ -193,46 +198,18 @@ def check_allowed_stdout(self, test: str) -> bool: if __name__ == '__main__': args = 
create_argparser().parse_args() - # run tests - if 'smartsim' in args.test: - has_smartsim: bool = args.smartredis_dir and Path(args.smartredis_dir).is_dir() - test_cases = [] - - if args.mode is RunMode.TAP: - print(f'1..1') - if has_smartsim: - sys.path.insert(0, str(Path(__file__).parents[1] / "examples" / "fluids")) - from smartsim_regression_framework import SmartSimTest - - test_framework = SmartSimTest(Path(__file__).parent / 'test_dir') - test_framework.setup() - - is_new_subtest = True - subtest_ok = True - for i, backend in enumerate(args.ceed_backends): - test_cases.append(test_framework.test_junit(backend)) - if is_new_subtest and args.mode == RunMode.TAP: - is_new_subtest = False - print(f'# Subtest: {test_cases[0].category}') - print(f' 1..{len(args.ceed_backends)}') - print(test_case_output_string(test_cases[i], TestSpec("SmartSim Tests"), args.mode, backend, '', i)) - if args.mode == RunMode.TAP: - print(f'{"" if subtest_ok else "not "}ok 1 - {test_cases[0].category}') - test_framework.teardown() - elif args.mode is RunMode.TAP: - print(f'ok 1 - # SKIP SmartSim not installed') - result: TestSuite = TestSuite('SmartSim Tests', test_cases) - else: - result: TestSuite = run_tests( - args.test, - args.ceed_backends, - args.mode, - args.nproc, - CeedSuiteSpec(), - args.pool_size) + result: TestSuite = run_tests( + args.test, + args.ceed_backends, + args.mode, + args.nproc, + CeedSuiteSpec(), + args.pool_size, + search=args.search, + verbose=args.verbose) # write output and check for failures if args.mode is RunMode.JUNIT: - write_junit_xml(result, args.output, args.junit_batch) + write_junit_xml(result, args.junit_batch) if has_failures(result): sys.exit(1) diff --git a/tests/junit_common.py b/tests/junit_common.py index 607d21e9ee..ce1115a547 100644 --- a/tests/junit_common.py +++ b/tests/junit_common.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod +from collections.abc import Iterable import argparse import csv -from dataclasses import 
dataclass, field +from dataclasses import dataclass, field, fields import difflib from enum import Enum from math import isclose @@ -10,52 +11,75 @@ import re import subprocess import multiprocessing as mp -from itertools import product import sys import time -from typing import Optional, Tuple, List +from typing import Optional, Tuple, List, Dict, Callable, Iterable, get_origin +import shutil sys.path.insert(0, str(Path(__file__).parent / "junit-xml")) from junit_xml import TestCase, TestSuite, to_xml_report_string # nopep8 +class ParseError(RuntimeError): + """A custom exception for failed parsing.""" + + def __init__(self, message): + super().__init__(message) + + class CaseInsensitiveEnumAction(argparse.Action): """Action to convert input values to lower case prior to converting to an Enum type""" def __init__(self, option_strings, dest, type, default, **kwargs): - if not (issubclass(type, Enum) and issubclass(type, str)): - raise ValueError(f"{type} must be a StrEnum or str and Enum") + if not issubclass(type, Enum): + raise ValueError(f"{type} must be an Enum") # store provided enum type self.enum_type = type - if isinstance(default, str): + if isinstance(default, self.enum_type): + pass + elif isinstance(default, str): default = self.enum_type(default.lower()) - else: + elif isinstance(default, Iterable): default = [self.enum_type(v.lower()) for v in default] + else: + raise argparse.ArgumentTypeError("Invalid value type, must be str or iterable") # prevent automatic type conversion super().__init__(option_strings, dest, default=default, **kwargs) def __call__(self, parser, namespace, values, option_string=None): - if isinstance(values, str): + if isinstance(values, self.enum_type): + pass + elif isinstance(values, str): values = self.enum_type(values.lower()) - else: + elif isinstance(values, Iterable): values = [self.enum_type(v.lower()) for v in values] + else: + raise argparse.ArgumentTypeError("Invalid value type, must be str or iterable") 
setattr(namespace, self.dest, values) @dataclass class TestSpec: """Dataclass storing information about a single test case""" - name: str + name: str = field(default_factory=str) + csv_rtol: float = -1 + csv_ztol: float = -1 + cgns_tol: float = -1 only: List = field(default_factory=list) args: List = field(default_factory=list) + key_values: Dict = field(default_factory=dict) -class RunMode(str, Enum): +class RunMode(Enum): """Enumeration of run modes, either `RunMode.TAP` or `RunMode.JUNIT`""" - __str__ = str.__str__ - __format__ = str.__format__ - TAP: str = 'tap' - JUNIT: str = 'junit' + TAP = 'tap' + JUNIT = 'junit' + + def __str__(self): + return self.value + + def __repr__(self): + return self.value class SuiteSpec(ABC): @@ -97,6 +121,11 @@ def get_output_path(self, test: str, output_file: str) -> Path: """ raise NotImplementedError + @property + def test_failure_artifacts_path(self) -> Path: + """Path to test failure artifacts""" + return Path('build') / 'test_failure_artifacts' + @property def cgns_tol(self): """Absolute tolerance for CGNS diff""" @@ -106,7 +135,41 @@ def cgns_tol(self): def cgns_tol(self, val): self._cgns_tol = val - def post_test_hook(self, test: str, spec: TestSpec) -> None: + @property + def csv_ztol(self): + """Keyword arguments to be passed to diff_csv()""" + return getattr(self, '_csv_ztol', 3e-10) + + @csv_ztol.setter + def csv_ztol(self, val): + self._csv_ztol = val + + @property + def csv_rtol(self): + """Keyword arguments to be passed to diff_csv()""" + return getattr(self, '_csv_rtol', 1e-6) + + @csv_rtol.setter + def csv_rtol(self, val): + self._csv_rtol = val + + @property + def csv_comment_diff_fn(self): # -> Any | Callable[..., None]: + return getattr(self, '_csv_comment_diff_fn', None) + + @csv_comment_diff_fn.setter + def csv_comment_diff_fn(self, test_fn): + self._csv_comment_diff_fn = test_fn + + @property + def csv_comment_str(self): + return getattr(self, '_csv_comment_str', '#') + + @csv_comment_str.setter + def 
csv_comment_str(self, comment_str): + self._csv_comment_str = comment_str + + def post_test_hook(self, test: str, spec: TestSpec, backend: str) -> None: """Function callback ran after each test case Args: @@ -181,7 +244,7 @@ def has_cgnsdiff() -> bool: stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=my_env) - return 'not found' not in proc.stderr.decode('utf-8') + return 'not found' not in proc.stderr.decode('utf-8', errors='replace') def contains_any(base: str, substrings: List[str]) -> bool: @@ -210,7 +273,40 @@ def startswith_any(base: str, prefixes: List[str]) -> bool: return any((base.startswith(prefix) for prefix in prefixes)) -def parse_test_line(line: str) -> TestSpec: +def find_matching(line: str, open: str = '(', close: str = ')') -> Tuple[int, int]: + """Find the start and end positions of the first outer paired delimeters + + Args: + line (str): Line to search + open (str, optional): Opening delimiter, must be different than `close`. Defaults to '('. + close (str, optional): Closing delimeter, must be different than `open`. Defaults to ')'. + + Raises: + RuntimeError: If open or close is not a single character + RuntimeError: If open and close are the same characters + + Returns: + Tuple[int]: If matching delimeters are found, return indices in `list`. Otherwise, return end < start. 
+ """ + if len(open) != 1 or len(close) != 1: + raise RuntimeError("`open` and `close` must be single characters") + if open == close: + raise RuntimeError("`open` and `close` must be different characters") + start: int = line.find(open) + if start < 0: + return -1, -1 + count: int = 1 + for i in range(start + 1, len(line)): + if line[i] == open: + count += 1 + if line[i] == close: + count -= 1 + if count == 0: + return start, i + return start, -1 + + +def parse_test_line(line: str, fallback_name: str = '') -> TestSpec: """Parse a single line of TESTARGS and CLI arguments into a `TestSpec` object Args: @@ -219,18 +315,61 @@ def parse_test_line(line: str) -> TestSpec: Returns: TestSpec: Parsed specification of test case """ - args: List[str] = re.findall("(?:\".*?\"|\\S)+", line.strip()) - if args[0] == 'TESTARGS': - return TestSpec(name='', args=args[1:]) - raw_test_args: str = args[0][args[0].index('TESTARGS(') + 9:args[0].rindex(')')] - # transform 'name="myname",only="serial,int32"' into {'name': 'myname', 'only': 'serial,int32'} - test_args: dict = dict([''.join(t).split('=') for t in re.findall(r"""([^,=]+)(=)"([^"]*)\"""", raw_test_args)]) - name: str = test_args.get('name', '') - constraints: List[str] = test_args['only'].split(',') if 'only' in test_args else [] - if len(args) > 1: - return TestSpec(name=name, only=constraints, args=args[1:]) - else: - return TestSpec(name=name, only=constraints) + test_fields = fields(TestSpec) + field_names = [f.name for f in test_fields] + known: Dict = dict() + other: Dict = dict() + if line[0] == "(": + # have key/value pairs to parse + start, end = find_matching(line) + if end < start: + raise ParseError(f"Mismatched parentheses in TESTCASE: {line}") + + keyvalues_str = line[start:end + 1] + keyvalues_pattern = re.compile(r''' + (?:\(\s*|\s*,\s*) # start with open parentheses or comma, no capture + ([A-Za-z]+[\w\-]+) # match key starting with alpha, containing alphanumeric, _, or -; captured as Group 1 + \s*=\s* # key 
is followed by = (whitespace ignored) + (?: # uncaptured group for OR + "((?:[^"]|\\")+)" # match quoted value (any internal " must be escaped as \"); captured as Group 2 + | ([^=]+) # OR match unquoted value (no equals signs allowed); captured as Group 3 + ) # end uncaptured group for OR + \s*(?=,|\)) # lookahead for either next comma or closing parentheses + ''', re.VERBOSE) + + for match in re.finditer(keyvalues_pattern, keyvalues_str): + if not match: # empty + continue + key = match.group(1) + value = match.group(2) if match.group(2) else match.group(3) + try: + index = field_names.index(key) + if key == "only": # weird bc only is a list + value = [constraint.strip() for constraint in value.split(',')] + try: + # TODO: stop supporting python <=3.8 + known[key] = test_fields[index].type(value) # type: ignore + except TypeError: + # TODO: this is still liable to fail for complex types + known[key] = get_origin(test_fields[index].type)(value) # type: ignore + except ValueError: + other[key] = value + + line = line[end + 1:] + + if not 'name' in known.keys(): + known['name'] = fallback_name + + args_pattern = re.compile(r''' + \s+( # remove leading space + (?:"[^"]+") # match quoted CLI option + | (?:[\S]+) # match anything else that is space separated + ) + ''', re.VERBOSE) + args: List[str] = re.findall(args_pattern, line) + for k, v in other.items(): + print(f"warning, unknown TESTCASE option for test '{known['name']}': {k}={v}") + return TestSpec(**known, key_values=other, args=args) def get_test_args(source_file: Path) -> List[TestSpec]: @@ -257,19 +396,22 @@ def get_test_args(source_file: Path) -> List[TestSpec]: else: raise RuntimeError(f'Unrecognized extension for file: {source_file}') - return [parse_test_line(line.strip(comment_str)) + return [parse_test_line(line.strip(comment_str).removeprefix("TESTARGS"), source_file.stem) for line in source_file.read_text().splitlines() - if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec('', 
args=['{ceed_resource}'])] + if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec(source_file.stem, args=['{ceed_resource}'])] -def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: float = 1e-2) -> str: +def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float, rel_tol: float, + comment_str: str = '#', comment_func: Optional[Callable[[str, str], Optional[str]]] = None) -> str: """Compare CSV results against an expected CSV file with tolerances Args: test_csv (Path): Path to output CSV results true_csv (Path): Path to expected CSV results - zero_tol (float, optional): Tolerance below which values are considered to be zero. Defaults to 3e-10. - rel_tol (float, optional): Relative tolerance for comparing non-zero values. Defaults to 1e-2. + zero_tol (float): Tolerance below which values are considered to be zero. + rel_tol (float): Relative tolerance for comparing non-zero values. + comment_str (str, optional): String to denoting commented line + comment_func (Callable, optional): Function to determine if test and true line are different Returns: str: Diff output between result and expected CSVs @@ -281,15 +423,38 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f return f'No lines found in test output {test_csv}' if len(true_lines) == 0: return f'No lines found in test source {true_csv}' + if len(test_lines) != len(true_lines): + return f'Number of lines in {test_csv} and {true_csv} do not match' + + # Process commented lines + uncommented_lines: List[int] = [] + for n, (test_line, true_line) in enumerate(zip(test_lines, true_lines)): + if test_line[0] == comment_str and true_line[0] == comment_str: + if comment_func: + output = comment_func(test_line, true_line) + if output: + return output + elif test_line[0] == comment_str and true_line[0] != comment_str: + return f'Commented line found in {test_csv} at line {n} but not in {true_csv}' + elif test_line[0] != comment_str and true_line[0] == 
comment_str: + return f'Commented line found in {true_csv} at line {n} but not in {test_csv}' + else: + uncommented_lines.append(n) + + # Remove commented lines + test_lines = [test_lines[line] for line in uncommented_lines] + true_lines = [true_lines[line] for line in uncommented_lines] test_reader: csv.DictReader = csv.DictReader(test_lines) true_reader: csv.DictReader = csv.DictReader(true_lines) + if not test_reader.fieldnames: + return f'No CSV columns found in test output {test_csv}' + if not true_reader.fieldnames: + return f'No CSV columns found in test source {true_csv}' if test_reader.fieldnames != true_reader.fieldnames: return ''.join(difflib.unified_diff([f'{test_lines[0]}\n'], [f'{true_lines[0]}\n'], tofile='found CSV columns', fromfile='expected CSV columns')) - if len(test_lines) != len(true_lines): - return f'Number of lines in {test_csv} and {true_csv} do not match' diff_lines: List[str] = list() for test_line, true_line in zip(test_reader, true_reader): for key in test_reader.fieldnames: @@ -313,13 +478,13 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f return '\n'.join(diff_lines) -def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float = 1e-12) -> str: +def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float) -> str: """Compare CGNS results against an expected CGSN file with tolerance Args: test_cgns (Path): Path to output CGNS file true_cgns (Path): Path to expected CGNS file - cgns_tol (float, optional): Tolerance for comparing floating-point values + cgns_tol (float): Tolerance for comparing floating-point values Returns: str: Diff output between result and expected CGNS files @@ -333,35 +498,63 @@ def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float = 1e-12) -> str: stderr=subprocess.PIPE, env=my_env) - return proc.stderr.decode('utf-8') + proc.stdout.decode('utf-8') + return proc.stderr.decode('utf-8', errors='replace') + proc.stdout.decode('utf-8', errors='replace') + + +def 
diff_ascii(test_file: Path, true_file: Path, backend: str) -> str: + """Compare ASCII results against an expected ASCII file + + Args: + test_file (Path): Path to output ASCII file + true_file (Path): Path to expected ASCII file + + Returns: + str: Diff output between result and expected ASCII files + """ + tmp_backend: str = backend.replace('/', '-') + true_str: str = true_file.read_text().replace('{ceed_resource}', tmp_backend) + diff = list(difflib.unified_diff(test_file.read_text().splitlines(keepends=True), + true_str.splitlines(keepends=True), + fromfile=str(test_file), + tofile=str(true_file))) + return ''.join(diff) def test_case_output_string(test_case: TestCase, spec: TestSpec, mode: RunMode, - backend: str, test: str, index: int) -> str: + backend: str, test: str, index: int, verbose: bool) -> str: output_str = '' if mode is RunMode.TAP: # print incremental output if TAP mode if test_case.is_skipped(): output_str += f' ok {index} - {spec.name}, {backend} # SKIP {test_case.skipped[0]["message"]}\n' elif test_case.is_failure() or test_case.is_error(): - output_str += f' not ok {index} - {spec.name}, {backend}\n' + output_str += f' not ok {index} - {spec.name}, {backend} ({test_case.elapsed_sec} s)\n' else: - output_str += f' ok {index} - {spec.name}, {backend}\n' - output_str += f' ---\n' - if spec.only: - output_str += f' only: {",".join(spec.only)}\n' - output_str += f' args: {test_case.args}\n' - if test_case.is_error(): - output_str += f' error: {test_case.errors[0]["message"]}\n' - if test_case.is_failure(): - output_str += f' num_failures: {len(test_case.failures)}\n' - for i, failure in enumerate(test_case.failures): - output_str += f' failure_{i}: {failure["message"]}\n' - output_str += f' message: {failure["message"]}\n' - if failure["output"]: - out = failure["output"].strip().replace('\n', '\n ') - output_str += f' output: |\n {out}\n' - output_str += f' ...\n' + output_str += f' ok {index} - {spec.name}, {backend} ({test_case.elapsed_sec} s)\n' 
+ if test_case.is_failure() or test_case.is_error() or verbose: + output_str += f' ---\n' + if spec.only: + output_str += f' only: {",".join(spec.only)}\n' + output_str += f' args: {test_case.args}\n' + if spec.csv_ztol > 0: + output_str += f' csv_ztol: {spec.csv_ztol}\n' + if spec.csv_rtol > 0: + output_str += f' csv_rtol: {spec.csv_rtol}\n' + if spec.cgns_tol > 0: + output_str += f' cgns_tol: {spec.cgns_tol}\n' + for k, v in spec.key_values.items(): + output_str += f' {k}: {v}\n' + if test_case.is_error(): + output_str += f' error: {test_case.errors[0]["message"]}\n' + if test_case.is_failure(): + output_str += f' failures:\n' + for i, failure in enumerate(test_case.failures): + output_str += f' -\n' + output_str += f' message: {failure["message"]}\n' + if failure["output"]: + out = failure["output"].strip().replace('\n', '\n ') + output_str += f' output: |\n {out}\n' + output_str += f' ...\n' else: # print error or failure information if JUNIT mode if test_case.is_error() or test_case.is_failure(): @@ -377,8 +570,20 @@ def test_case_output_string(test_case: TestCase, spec: TestSpec, mode: RunMode, return output_str +def save_failure_artifact(suite_spec: SuiteSpec, file: Path) -> Path: + """Attach a file to a test case + + Args: + test_case (TestCase): Test case to attach the file to + file (Path): Path to the file to attach + """ + save_path: Path = suite_spec.test_failure_artifacts_path / file.name + shutil.copyfile(file, save_path) + return save_path + + def run_test(index: int, test: str, spec: TestSpec, backend: str, - mode: RunMode, nproc: int, suite_spec: SuiteSpec) -> TestCase: + mode: RunMode, nproc: int, suite_spec: SuiteSpec, verbose: bool = False) -> TestCase: """Run a single test case and backend combination Args: @@ -389,6 +594,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str, mode (RunMode): Output mode nproc (int): Number of MPI processes to use when running test case suite_spec (SuiteSpec): Specification of test suite + 
verbose (bool, optional): Print detailed output for all runs, not just failures. Defaults to False. Returns: TestCase: Test case result @@ -407,7 +613,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str, run_args = ['mpiexec', '-n', f'{nproc}', *run_args] # run test - skip_reason: str = suite_spec.check_pre_skip(test, spec, backend, nproc) + skip_reason: Optional[str] = suite_spec.check_pre_skip(test, spec, backend, nproc) if skip_reason: test_case: TestCase = TestCase(f'{test}, "{spec.name}", n{nproc}, {backend}', elapsed_sec=0, @@ -428,24 +634,28 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str, classname=source_path.parent, elapsed_sec=time.time() - start, timestamp=time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(start)), - stdout=proc.stdout.decode('utf-8'), - stderr=proc.stderr.decode('utf-8'), + stdout=proc.stdout.decode('utf-8', errors='replace'), + stderr=proc.stderr.decode('utf-8', errors='replace'), allow_multiple_subelements=True, category=spec.name,) ref_csvs: List[Path] = [] - output_files: List[str] = [arg for arg in run_args if 'ascii:' in arg] + ref_ascii: List[Path] = [] + output_files: List[str] = [arg.split(':')[1] for arg in run_args if arg.startswith('ascii:')] if output_files: - ref_csvs = [suite_spec.get_output_path(test, file.split('ascii:')[-1]) for file in output_files] + ref_csvs = [suite_spec.get_output_path(test, file) + for file in output_files if file.endswith('.csv')] + ref_ascii = [suite_spec.get_output_path(test, file) + for file in output_files if not file.endswith('.csv')] ref_cgns: List[Path] = [] - output_files = [arg for arg in run_args if 'cgns:' in arg] + output_files = [arg.split(':')[1] for arg in run_args if arg.startswith('cgns:')] if output_files: - ref_cgns = [suite_spec.get_output_path(test, file.split('cgns:')[-1]) for file in output_files] + ref_cgns = [suite_spec.get_output_path(test, file) for file in output_files] ref_stdout: Path = suite_spec.get_output_path(test, test + 
'.out') - suite_spec.post_test_hook(test, spec) + suite_spec.post_test_hook(test, spec, backend) # check allowed failures if not test_case.is_skipped() and test_case.stderr: - skip_reason: str = suite_spec.check_post_skip(test, spec, backend, test_case.stderr) + skip_reason: Optional[str] = suite_spec.check_post_skip(test, spec, backend, test_case.stderr) if skip_reason: test_case.add_skipped_info(skip_reason) @@ -460,7 +670,12 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str, # classify other results if not test_case.is_skipped() and not test_case.status: - if test_case.stderr: + # Filter out chipStar (CHIP) runtime informational/warning lines which are not errors + filtered_stderr = '\n'.join( + line for line in test_case.stderr.split('\n') + if not line.startswith(('CHIP info ', 'CHIP warning ', 'CHIP debug ')) + ).strip() + if filtered_stderr: test_case.add_failure_info('stderr', test_case.stderr) if proc.returncode != 0: test_case.add_error_info(f'returncode = {proc.returncode}') @@ -476,35 +691,73 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str, # expected CSV output for ref_csv in ref_csvs: csv_name = ref_csv.name + out_file = Path.cwd() / csv_name if not ref_csv.is_file(): # remove _{ceed_backend} from path name ref_csv = (ref_csv.parent / ref_csv.name.rsplit('_', 1)[0]).with_suffix('.csv') if not ref_csv.is_file(): test_case.add_failure_info('csv', output=f'{ref_csv} not found') + elif not out_file.is_file(): + test_case.add_failure_info('csv', output=f'{out_file} not found') else: - diff: str = diff_csv(Path.cwd() / csv_name, ref_csv) + csv_ztol: float = spec.csv_ztol if spec.csv_ztol > 0 else suite_spec.csv_ztol + csv_rtol: float = spec.csv_rtol if spec.csv_rtol > 0 else suite_spec.csv_rtol + diff = diff_csv( + out_file, + ref_csv, + csv_ztol, + csv_rtol, + suite_spec.csv_comment_str, + suite_spec.csv_comment_diff_fn) if diff: - test_case.add_failure_info('csv', output=diff) + save_path: Path = 
suite_spec.test_failure_artifacts_path / csv_name + shutil.move(out_file, save_path) + test_case.add_failure_info(f'csv: {save_path}', output=diff) else: - (Path.cwd() / csv_name).unlink() + out_file.unlink() # expected CGNS output for ref_cgn in ref_cgns: cgn_name = ref_cgn.name + out_file = Path.cwd() / cgn_name if not ref_cgn.is_file(): # remove _{ceed_backend} from path name ref_cgn = (ref_cgn.parent / ref_cgn.name.rsplit('_', 1)[0]).with_suffix('.cgns') if not ref_cgn.is_file(): test_case.add_failure_info('cgns', output=f'{ref_cgn} not found') + elif not out_file.is_file(): + test_case.add_failure_info('cgns', output=f'{out_file} not found') + else: + cgns_tol = spec.cgns_tol if spec.cgns_tol > 0 else suite_spec.cgns_tol + diff = diff_cgns(out_file, ref_cgn, cgns_tol=cgns_tol) + if diff: + save_path: Path = suite_spec.test_failure_artifacts_path / cgn_name + shutil.move(out_file, save_path) + test_case.add_failure_info(f'cgns: {save_path}', output=diff) + else: + out_file.unlink() + # expected ASCII output + for ref_file in ref_ascii: + ref_name = ref_file.name + out_file = Path.cwd() / ref_name + if not ref_file.is_file(): + # remove _{ceed_backend} from path name + ref_file = (ref_file.parent / ref_file.name.rsplit('_', 1)[0]).with_suffix(ref_file.suffix) + if not ref_file.is_file(): + test_case.add_failure_info('ascii', output=f'{ref_file} not found') + elif not out_file.is_file(): + test_case.add_failure_info('ascii', output=f'{out_file} not found') else: - diff = diff_cgns(Path.cwd() / cgn_name, ref_cgn, cgns_tol=suite_spec.cgns_tol) + diff = diff_ascii(out_file, ref_file, backend) if diff: - test_case.add_failure_info('cgns', output=diff) + save_path: Path = suite_spec.test_failure_artifacts_path / ref_name + shutil.move(out_file, save_path) + test_case.add_failure_info(f'ascii: {save_path}', output=diff) else: - (Path.cwd() / cgn_name).unlink() + out_file.unlink() # store result test_case.args = ' '.join(str(arg) for arg in run_args) - output_str = 
test_case_output_string(test_case, spec, mode, backend, test, index) + output_str = test_case_output_string(test_case, spec, mode, backend, test, index, verbose) return test_case, output_str @@ -518,7 +771,7 @@ def init_process(): def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int, - suite_spec: SuiteSpec, pool_size: int = 1) -> TestSuite: + suite_spec: SuiteSpec, pool_size: int = 1, search: str = ".*", verbose: bool = False) -> TestSuite: """Run all test cases for `test` with each of the provided `ceed_backends` Args: @@ -528,18 +781,23 @@ def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int, nproc (int): Number of MPI processes to use when running each test case suite_spec (SuiteSpec): Object defining required methods for running tests pool_size (int, optional): Number of processes to use when running tests in parallel. Defaults to 1. + search (str, optional): Regular expression used to match tests. Defaults to ".*". + verbose (bool, optional): Print detailed output for all runs, not just failures. Defaults to False. 
Returns: TestSuite: JUnit `TestSuite` containing results of all test cases """ - test_specs: List[TestSpec] = get_test_args(suite_spec.get_source_path(test)) + test_specs: List[TestSpec] = [ + t for t in get_test_args(suite_spec.get_source_path(test)) if re.search(search, t.name, re.IGNORECASE) + ] + suite_spec.test_failure_artifacts_path.mkdir(parents=True, exist_ok=True) if mode is RunMode.TAP: print('TAP version 13') print(f'1..{len(test_specs)}') with mp.Pool(processes=pool_size, initializer=init_process) as pool: - async_outputs: List[List[mp.AsyncResult]] = [ - [pool.apply_async(run_test, (i, test, spec, backend, mode, nproc, suite_spec)) + async_outputs: List[List[mp.pool.AsyncResult]] = [ + [pool.apply_async(run_test, (i, test, spec, backend, mode, nproc, suite_spec, verbose)) for (i, backend) in enumerate(ceed_backends, start=1)] for spec in test_specs ] @@ -564,15 +822,14 @@ def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int, return TestSuite(test, test_cases) -def write_junit_xml(test_suite: TestSuite, output_file: Optional[Path], batch: str = '') -> None: +def write_junit_xml(test_suite: TestSuite, batch: str = '') -> None: """Write a JUnit XML file containing the results of a `TestSuite` Args: test_suite (TestSuite): JUnit `TestSuite` to write - output_file (Optional[Path]): Path to output file, or `None` to generate automatically as `build/{test_suite.name}{batch}.junit` batch (str): Name of JUnit batch, defaults to empty string """ - output_file: Path = output_file or Path('build') / (f'{test_suite.name}{batch}.junit') + output_file = Path('build') / (f'{test_suite.name}{batch}.junit') output_file.write_text(to_xml_report_string([test_suite])) diff --git a/tests/output/t107-vector-f.out b/tests/output/t107-vector-f.out index c4823d39c7..f3faa3e8ea 100644 --- a/tests/output/t107-vector-f.out +++ b/tests/output/t107-vector-f.out @@ -9,3 +9,14 @@ CeedVector length 10 17.00000000 18.00000000 19.00000000 + CeedVector length 10 + 
10.00000000 + 11.00000000 + 12.00000000 + 13.00000000 + 14.00000000 + 15.00000000 + 16.00000000 + 17.00000000 + 18.00000000 + 19.00000000 diff --git a/tests/output/t107-vector.out b/tests/output/t107-vector.out index c4823d39c7..f3faa3e8ea 100644 --- a/tests/output/t107-vector.out +++ b/tests/output/t107-vector.out @@ -9,3 +9,14 @@ CeedVector length 10 17.00000000 18.00000000 19.00000000 + CeedVector length 10 + 10.00000000 + 11.00000000 + 12.00000000 + 13.00000000 + 14.00000000 + 15.00000000 + 16.00000000 + 17.00000000 + 18.00000000 + 19.00000000 diff --git a/tests/output/t210-elemrestriction-f.out b/tests/output/t210-elemrestriction-f.out index 0696c8ce32..22990a413e 100644 --- a/tests/output/t210-elemrestriction-f.out +++ b/tests/output/t210-elemrestriction-f.out @@ -1 +1,2 @@ CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1 + CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1 diff --git a/tests/output/t210-elemrestriction.out b/tests/output/t210-elemrestriction.out index 0696c8ce32..22990a413e 100644 --- a/tests/output/t210-elemrestriction.out +++ b/tests/output/t210-elemrestriction.out @@ -1 +1,2 @@ CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1 + CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1 diff --git a/tests/output/t211-elemrestriction-f.out b/tests/output/t211-elemrestriction-f.out index af26a4a612..b2d7a029c4 100644 --- a/tests/output/t211-elemrestriction-f.out +++ b/tests/output/t211-elemrestriction-f.out @@ -1 +1,2 @@ CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] + CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] diff --git a/tests/output/t211-elemrestriction.out b/tests/output/t211-elemrestriction.out index af26a4a612..b2d7a029c4 100644 --- a/tests/output/t211-elemrestriction.out +++ b/tests/output/t211-elemrestriction.out 
@@ -1 +1,2 @@ CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] + CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] diff --git a/tests/output/t212-elemrestriction-f.out b/tests/output/t212-elemrestriction-f.out index a5cd6de40b..7d72d8c00f 100644 --- a/tests/output/t212-elemrestriction-f.out +++ b/tests/output/t212-elemrestriction-f.out @@ -1 +1,2 @@ Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] + Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] diff --git a/tests/output/t212-elemrestriction.out b/tests/output/t212-elemrestriction.out index a5cd6de40b..7d72d8c00f 100644 --- a/tests/output/t212-elemrestriction.out +++ b/tests/output/t212-elemrestriction.out @@ -1 +1,2 @@ Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] + Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2] diff --git a/tests/output/t300-basis-f.out b/tests/output/t300-basis-f.out index ebbe0f9635..5ab53a4686 100644 --- a/tests/output/t300-basis-f.out +++ b/tests/output/t300-basis-f.out @@ -32,3 +32,20 @@ CeedBasis in a H^1 space on a line element [1] -0.51670214 -0.48795249 1.33790510 -0.33325047 [2] 0.33325047 -1.33790510 0.48795249 0.51670214 [3] -0.18899664 0.63510411 -2.78794489 2.34183742 + CeedBasis in a H^1 space on a line element + P: 4 + Q: 4 + dimension: 1 + field components: 1 + qref1d: -0.86113631 -0.33998104 0.33998104 0.86113631 + qweight1d: 0.34785485 0.65214515 0.65214515 0.34785485 + interp1d: + [0] 0.62994317 0.47255875 -0.14950343 0.04700152 + [1] -0.07069480 0.97297619 0.13253993 -0.03482132 + [2] -0.03482132 0.13253993 0.97297619 -0.07069480 + [3] 0.04700152 -0.14950343 0.47255875 0.62994317 + grad1d: + [0] -2.34183742 2.78794489 -0.63510411 0.18899664 + [1] -0.51670214 -0.48795249 1.33790510 -0.33325047 + [2] 0.33325047 -1.33790510 
0.48795249 0.51670214 + [3] -0.18899664 0.63510411 -2.78794489 2.34183742 diff --git a/tests/output/t300-basis.out b/tests/output/t300-basis.out index ebbe0f9635..5ab53a4686 100644 --- a/tests/output/t300-basis.out +++ b/tests/output/t300-basis.out @@ -32,3 +32,20 @@ CeedBasis in a H^1 space on a line element [1] -0.51670214 -0.48795249 1.33790510 -0.33325047 [2] 0.33325047 -1.33790510 0.48795249 0.51670214 [3] -0.18899664 0.63510411 -2.78794489 2.34183742 + CeedBasis in a H^1 space on a line element + P: 4 + Q: 4 + dimension: 1 + field components: 1 + qref1d: -0.86113631 -0.33998104 0.33998104 0.86113631 + qweight1d: 0.34785485 0.65214515 0.65214515 0.34785485 + interp1d: + [0] 0.62994317 0.47255875 -0.14950343 0.04700152 + [1] -0.07069480 0.97297619 0.13253993 -0.03482132 + [2] -0.03482132 0.13253993 0.97297619 -0.07069480 + [3] 0.04700152 -0.14950343 0.47255875 0.62994317 + grad1d: + [0] -2.34183742 2.78794489 -0.63510411 0.18899664 + [1] -0.51670214 -0.48795249 1.33790510 -0.33325047 + [2] 0.33325047 -1.33790510 0.48795249 0.51670214 + [3] -0.18899664 0.63510411 -2.78794489 2.34183742 diff --git a/tests/output/t320-basis-f.out b/tests/output/t320-basis-f.out index a1522dd848..34c78eeaaf 100644 --- a/tests/output/t320-basis-f.out +++ b/tests/output/t320-basis-f.out @@ -19,3 +19,24 @@ CeedBasis in a H^1 space on a triangle element [5] 0.20000000 -2.40000000 0.00000000 0.00000000 2.40000000 -0.20000000 [6] -0.33333333 -1.33333333 0.00000000 0.00000000 1.33333333 0.33333333 [7] 0.20000000 -0.80000000 0.00000000 -1.60000000 0.80000000 1.40000000 + CeedBasis in a H^1 space on a triangle element + P: 6 + Q: 4 + dimension: 2 + field components: 1 + qref: 0.20000000 0.60000000 0.33333333 0.20000000 0.20000000 0.20000000 0.33333333 0.60000000 + qweight: 0.26041667 0.26041667 -0.28125000 0.26041667 + interp: + [0] 0.12000000 0.48000000 -0.12000000 0.48000000 0.16000000 -0.12000000 + [1] -0.12000000 0.48000000 0.12000000 0.16000000 0.48000000 -0.12000000 + [2] -0.11111111 
0.44444444 -0.11111111 0.44444444 0.44444444 -0.11111111 + [3] -0.12000000 0.16000000 -0.12000000 0.48000000 0.48000000 0.12000000 + grad: + [0] -1.40000000 1.60000000 -0.20000000 -0.80000000 0.80000000 0.00000000 + [1] 0.20000000 -1.60000000 1.40000000 -0.80000000 0.80000000 0.00000000 + [2] -0.33333333 0.00000000 0.33333333 -1.33333333 1.33333333 0.00000000 + [3] 0.20000000 0.00000000 -0.20000000 -2.40000000 2.40000000 0.00000000 + [4] -1.40000000 -0.80000000 0.00000000 1.60000000 0.80000000 -0.20000000 + [5] 0.20000000 -2.40000000 0.00000000 0.00000000 2.40000000 -0.20000000 + [6] -0.33333333 -1.33333333 0.00000000 0.00000000 1.33333333 0.33333333 + [7] 0.20000000 -0.80000000 0.00000000 -1.60000000 0.80000000 1.40000000 diff --git a/tests/output/t320-basis.out b/tests/output/t320-basis.out index a1522dd848..34c78eeaaf 100644 --- a/tests/output/t320-basis.out +++ b/tests/output/t320-basis.out @@ -19,3 +19,24 @@ CeedBasis in a H^1 space on a triangle element [5] 0.20000000 -2.40000000 0.00000000 0.00000000 2.40000000 -0.20000000 [6] -0.33333333 -1.33333333 0.00000000 0.00000000 1.33333333 0.33333333 [7] 0.20000000 -0.80000000 0.00000000 -1.60000000 0.80000000 1.40000000 + CeedBasis in a H^1 space on a triangle element + P: 6 + Q: 4 + dimension: 2 + field components: 1 + qref: 0.20000000 0.60000000 0.33333333 0.20000000 0.20000000 0.20000000 0.33333333 0.60000000 + qweight: 0.26041667 0.26041667 -0.28125000 0.26041667 + interp: + [0] 0.12000000 0.48000000 -0.12000000 0.48000000 0.16000000 -0.12000000 + [1] -0.12000000 0.48000000 0.12000000 0.16000000 0.48000000 -0.12000000 + [2] -0.11111111 0.44444444 -0.11111111 0.44444444 0.44444444 -0.11111111 + [3] -0.12000000 0.16000000 -0.12000000 0.48000000 0.48000000 0.12000000 + grad: + [0] -1.40000000 1.60000000 -0.20000000 -0.80000000 0.80000000 0.00000000 + [1] 0.20000000 -1.60000000 1.40000000 -0.80000000 0.80000000 0.00000000 + [2] -0.33333333 0.00000000 0.33333333 -1.33333333 1.33333333 0.00000000 + [3] 0.20000000 
0.00000000 -0.20000000 -2.40000000 2.40000000 0.00000000 + [4] -1.40000000 -0.80000000 0.00000000 1.60000000 0.80000000 -0.20000000 + [5] 0.20000000 -2.40000000 0.00000000 0.00000000 2.40000000 -0.20000000 + [6] -0.33333333 -1.33333333 0.00000000 0.00000000 1.33333333 0.33333333 + [7] 0.20000000 -0.80000000 0.00000000 -1.60000000 0.80000000 1.40000000 diff --git a/tests/output/t330-basis.out b/tests/output/t330-basis.out index 75e93004fc..1377df2bb5 100644 --- a/tests/output/t330-basis.out +++ b/tests/output/t330-basis.out @@ -34,3 +34,39 @@ CeedBasis in a H(div) space on a quadrilateral element [6] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 [7] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 [8] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + CeedBasis in a H(div) space on a quadrilateral element + P: 8 + Q: 9 + dimension: 2 + field components: 1 + qref: -0.77459667 0.00000000 0.77459667 -0.77459667 0.00000000 0.77459667 -0.77459667 0.00000000 0.77459667 -0.77459667 -0.77459667 -0.77459667 0.00000000 0.00000000 0.00000000 0.77459667 0.77459667 0.77459667 + qweight: 0.30864198 0.49382716 0.30864198 0.49382716 0.79012346 0.49382716 0.30864198 0.49382716 0.30864198 + interp: + [0] -0.05000000 0.05000000 0.10000000 0.01270167 -0.05000000 0.05000000 -0.78729833 -0.10000000 + [1] -0.12500000 0.12500000 0.44364917 0.05635083 -0.12500000 0.12500000 -0.44364917 -0.05635083 + [2] -0.05000000 0.05000000 0.78729833 0.10000000 -0.05000000 0.05000000 -0.10000000 -0.01270167 + [3] -0.05000000 0.05000000 0.05635083 0.05635083 -0.05000000 0.05000000 -0.44364917 -0.44364917 + [4] -0.12500000 0.12500000 0.25000000 0.25000000 -0.12500000 0.12500000 -0.25000000 -0.25000000 + [5] -0.05000000 0.05000000 0.44364917 0.44364917 -0.05000000 0.05000000 -0.05635083 -0.05635083 + [6] -0.05000000 0.05000000 0.01270167 0.10000000 -0.05000000 0.05000000 -0.10000000 
-0.78729833 + [7] -0.12500000 0.12500000 0.05635083 0.44364917 -0.12500000 0.12500000 -0.05635083 -0.44364917 + [8] -0.05000000 0.05000000 0.10000000 0.78729833 -0.05000000 0.05000000 -0.01270167 -0.10000000 + [9] -0.78729833 -0.10000000 -0.05000000 0.05000000 0.10000000 0.01270167 -0.05000000 0.05000000 + [10] -0.44364917 -0.44364917 -0.05000000 0.05000000 0.05635083 0.05635083 -0.05000000 0.05000000 + [11] -0.10000000 -0.78729833 -0.05000000 0.05000000 0.01270167 0.10000000 -0.05000000 0.05000000 + [12] -0.44364917 -0.05635083 -0.12500000 0.12500000 0.44364917 0.05635083 -0.12500000 0.12500000 + [13] -0.25000000 -0.25000000 -0.12500000 0.12500000 0.25000000 0.25000000 -0.12500000 0.12500000 + [14] -0.05635083 -0.44364917 -0.12500000 0.12500000 0.05635083 0.44364917 -0.12500000 0.12500000 + [15] -0.10000000 -0.01270167 -0.05000000 0.05000000 0.78729833 0.10000000 -0.05000000 0.05000000 + [16] -0.05635083 -0.05635083 -0.05000000 0.05000000 0.44364917 0.44364917 -0.05000000 0.05000000 + [17] -0.01270167 -0.10000000 -0.05000000 0.05000000 0.10000000 0.78729833 -0.05000000 0.05000000 + div: + [0] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [1] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [2] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [3] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [4] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [5] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [6] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [7] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 + [8] 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 0.25000000 diff --git a/tests/output/t340-basis.out 
b/tests/output/t340-basis.out index 5c97bec2a8..fc3b0b9123 100644 --- a/tests/output/t340-basis.out +++ b/tests/output/t340-basis.out @@ -19,3 +19,24 @@ CeedBasis in a H(curl) space on a triangle element [1] -1.20000000 -1.20000000 -8.40000000 1.20000000 8.40000000 -1.20000000 -9.60000000 9.60000000 [2] -1.20000000 8.40000000 1.20000000 -8.40000000 -1.20000000 -1.20000000 9.60000000 9.60000000 [3] 8.40000000 -1.20000000 1.20000000 1.20000000 -1.20000000 8.40000000 0.00000000 -19.20000000 + CeedBasis in a H(curl) space on a triangle element + P: 8 + Q: 4 + dimension: 2 + field components: 1 + qref: 0.33333333 0.20000000 0.20000000 0.60000000 0.33333333 0.20000000 0.60000000 0.20000000 + qweight: -0.26041667 0.26041667 0.28125000 0.26041667 + interp: + [0] -0.22222222 0.44444444 0.22222222 -0.44444444 -0.22222222 -0.22222222 2.66666667 0.00000000 + [1] 0.08000000 0.48000000 0.56000000 -0.48000000 1.04000000 -0.72000000 2.24000000 -0.64000000 + [2] 0.24000000 -0.48000000 -0.24000000 0.48000000 -0.56000000 -0.56000000 2.88000000 0.00000000 + [3] -0.56000000 0.48000000 -0.08000000 -0.48000000 -0.72000000 1.04000000 1.60000000 0.64000000 + [4] -0.44444444 0.22222222 -0.22222222 -0.22222222 0.22222222 -0.44444444 0.00000000 2.66666667 + [5] -0.48000000 -0.08000000 1.04000000 -0.72000000 0.56000000 -0.48000000 -0.64000000 2.24000000 + [6] -0.48000000 0.56000000 -0.72000000 1.04000000 -0.08000000 -0.48000000 0.64000000 1.60000000 + [7] 0.48000000 -0.24000000 -0.56000000 -0.56000000 -0.24000000 0.48000000 0.00000000 2.88000000 + curl: + [0] 2.00000000 2.00000000 -2.00000000 -2.00000000 2.00000000 2.00000000 0.00000000 0.00000000 + [1] -1.20000000 -1.20000000 -8.40000000 1.20000000 8.40000000 -1.20000000 -9.60000000 9.60000000 + [2] -1.20000000 8.40000000 1.20000000 -8.40000000 -1.20000000 -1.20000000 9.60000000 9.60000000 + [3] 8.40000000 -1.20000000 1.20000000 1.20000000 -1.20000000 8.40000000 0.00000000 -19.20000000 diff --git a/tests/output/t402-qfunction-f.out 
b/tests/output/t402-qfunction-f.out index 7163a434f1..dc4e005814 100644 --- a/tests/output/t402-qfunction-f.out +++ b/tests/output/t402-qfunction-f.out @@ -26,3 +26,20 @@ User CeedQFunction - mass EvalMode: "interpolation" CeedQFunctionContext Context Data Size: 40 + User CeedQFunction - mass + 2 input fields: + Input field 0: + Name: "qdata" + Size: 1 + EvalMode: "none" + Input field 1: + Name: "u" + Size: 1 + EvalMode: "interpolation" + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: "interpolation" + CeedQFunctionContext + Context Data Size: 40 diff --git a/tests/output/t402-qfunction.out b/tests/output/t402-qfunction.out index 4d131f7852..ad131d8d36 100644 --- a/tests/output/t402-qfunction.out +++ b/tests/output/t402-qfunction.out @@ -27,3 +27,21 @@ User CeedQFunction - mass CeedQFunctionContext Context Data Size: 40 Labeled double field: scale + User CeedQFunction - mass + 2 input fields: + Input field 0: + Name: "q data" + Size: 1 + EvalMode: "none" + Input field 1: + Name: "u" + Size: 1 + EvalMode: "interpolation" + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: "interpolation" + CeedQFunctionContext + Context Data Size: 40 + Labeled double field: scale diff --git a/tests/output/t413-qfunction-f.out b/tests/output/t413-qfunction-f.out index ffee1bdca7..05731e4204 100644 --- a/tests/output/t413-qfunction-f.out +++ b/tests/output/t413-qfunction-f.out @@ -28,3 +28,18 @@ Gallery CeedQFunction - MassApply Name: "v" Size: 1 EvalMode: "interpolation" + Gallery CeedQFunction - MassApply + 2 input fields: + Input field 0: + Name: "u" + Size: 1 + EvalMode: "interpolation" + Input field 1: + Name: "qdata" + Size: 1 + EvalMode: "none" + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: "interpolation" diff --git a/tests/output/t413-qfunction.out b/tests/output/t413-qfunction.out index ffee1bdca7..05731e4204 100644 --- a/tests/output/t413-qfunction.out +++ b/tests/output/t413-qfunction.out @@ -28,3 +28,18 @@ 
Gallery CeedQFunction - MassApply Name: "v" Size: 1 EvalMode: "interpolation" + Gallery CeedQFunction - MassApply + 2 input fields: + Input field 0: + Name: "u" + Size: 1 + EvalMode: "interpolation" + Input field 1: + Name: "qdata" + Size: 1 + EvalMode: "none" + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: "interpolation" diff --git a/tests/output/t504-operator-f.out b/tests/output/t504-operator-f.out index 3fcc6b0458..3b5857619f 100644 --- a/tests/output/t504-operator-f.out +++ b/tests/output/t504-operator-f.out @@ -1,4 +1,4 @@ -CeedOperator +CeedOperator - setup 15 elements with 8 quadrature points each 3 fields 2 input fields: @@ -19,23 +19,23 @@ CeedOperator EvalMode: none No basis Active vector -CeedOperator - 15 elements with 8 quadrature points each - 3 fields - 2 input fields: - Input field 0: - Name: "rho" - Size: 1 - EvalMode: none - No basis - Input field 1: - Name: "u" - Size: 2 - EvalMode: interpolation - Active vector - 1 output field: - Output field 0: - Name: "v" - Size: 2 - EvalMode: interpolation - Active vector + CeedOperator - mass + 15 elements with 8 quadrature points each + 3 fields + 2 input fields: + Input field 0: + Name: "rho" + Size: 1 + EvalMode: none + No basis + Input field 1: + Name: "u" + Size: 2 + EvalMode: interpolation + Active vector + 1 output field: + Output field 0: + Name: "v" + Size: 2 + EvalMode: interpolation + Active vector diff --git a/tests/output/t504-operator.out b/tests/output/t504-operator.out index 3fcc6b0458..4f23570743 100644 --- a/tests/output/t504-operator.out +++ b/tests/output/t504-operator.out @@ -1,4 +1,5 @@ -CeedOperator +CeedOperator - setup +CeedOperator - setup 15 elements with 8 quadrature points each 3 fields 2 input fields: @@ -19,23 +20,24 @@ CeedOperator EvalMode: none No basis Active vector -CeedOperator - 15 elements with 8 quadrature points each - 3 fields - 2 input fields: - Input field 0: - Name: "rho" - Size: 1 - EvalMode: none - No basis - Input field 1: - Name: "u" - 
Size: 2 - EvalMode: interpolation - Active vector - 1 output field: - Output field 0: - Name: "v" - Size: 2 - EvalMode: interpolation - Active vector + CeedOperator - mass + CeedOperator - mass + 15 elements with 8 quadrature points each + 3 fields + 2 input fields: + Input field 0: + Name: "rho" + Size: 1 + EvalMode: none + No basis + Input field 1: + Name: "u" + Size: 2 + EvalMode: interpolation + Active vector + 1 output field: + Output field 0: + Name: "v" + Size: 2 + EvalMode: interpolation + Active vector diff --git a/tests/output/t523-operator-f.out b/tests/output/t523-operator-f.out index 1817a8a2cf..2a17d484bb 100644 --- a/tests/output/t523-operator-f.out +++ b/tests/output/t523-operator-f.out @@ -39,44 +39,44 @@ Composite CeedOperator - setup Size: 1 EvalMode: none No basis -Composite CeedOperator - mass - SubOperator 0 - triangle elements: - 6 elements with 4 quadrature points each - 3 fields - 2 input fields: - Input field 0: - Name: "rho" - Size: 1 - EvalMode: none - No basis - Input field 1: - Name: "u" - Size: 1 - EvalMode: interpolation - Active vector - 1 output field: - Output field 0: - Name: "v" - Size: 1 - EvalMode: interpolation - Active vector - SubOperator 1 - quadrilateral elements: - 6 elements with 16 quadrature points each - 3 fields - 2 input fields: - Input field 0: - Name: "rho" - Size: 1 - EvalMode: none - No basis - Input field 1: - Name: "u" - Size: 1 - EvalMode: interpolation - Active vector - 1 output field: - Output field 0: - Name: "v" - Size: 1 - EvalMode: interpolation - Active vector + Composite CeedOperator - mass + SubOperator 0 - triangle elements: + 6 elements with 4 quadrature points each + 3 fields + 2 input fields: + Input field 0: + Name: "rho" + Size: 1 + EvalMode: none + No basis + Input field 1: + Name: "u" + Size: 1 + EvalMode: interpolation + Active vector + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: interpolation + Active vector + SubOperator 1 - quadrilateral elements: + 6 elements 
with 16 quadrature points each + 3 fields + 2 input fields: + Input field 0: + Name: "rho" + Size: 1 + EvalMode: none + No basis + Input field 1: + Name: "u" + Size: 1 + EvalMode: interpolation + Active vector + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: interpolation + Active vector diff --git a/tests/output/t523-operator.out b/tests/output/t523-operator.out index 1817a8a2cf..742f6954a7 100644 --- a/tests/output/t523-operator.out +++ b/tests/output/t523-operator.out @@ -1,3 +1,6 @@ +Composite CeedOperator - setup + SubOperator 0 - triangle elements + SubOperator 1 - quadrilateral elements Composite CeedOperator - setup SubOperator 0 - triangle elements: 6 elements with 4 quadrature points each @@ -39,44 +42,47 @@ Composite CeedOperator - setup Size: 1 EvalMode: none No basis -Composite CeedOperator - mass - SubOperator 0 - triangle elements: - 6 elements with 4 quadrature points each - 3 fields - 2 input fields: - Input field 0: - Name: "rho" - Size: 1 - EvalMode: none - No basis - Input field 1: - Name: "u" - Size: 1 - EvalMode: interpolation - Active vector - 1 output field: - Output field 0: - Name: "v" - Size: 1 - EvalMode: interpolation - Active vector - SubOperator 1 - quadrilateral elements: - 6 elements with 16 quadrature points each - 3 fields - 2 input fields: - Input field 0: - Name: "rho" - Size: 1 - EvalMode: none - No basis - Input field 1: - Name: "u" - Size: 1 - EvalMode: interpolation - Active vector - 1 output field: - Output field 0: - Name: "v" - Size: 1 - EvalMode: interpolation - Active vector + Composite CeedOperator - mass + SubOperator 0 - triangle elements + SubOperator 1 - quadrilateral elements + Composite CeedOperator - mass + SubOperator 0 - triangle elements: + 6 elements with 4 quadrature points each + 3 fields + 2 input fields: + Input field 0: + Name: "rho" + Size: 1 + EvalMode: none + No basis + Input field 1: + Name: "u" + Size: 1 + EvalMode: interpolation + Active vector + 1 output field: + Output field 
0: + Name: "v" + Size: 1 + EvalMode: interpolation + Active vector + SubOperator 1 - quadrilateral elements: + 6 elements with 16 quadrature points each + 3 fields + 2 input fields: + Input field 0: + Name: "rho" + Size: 1 + EvalMode: none + No basis + Input field 1: + Name: "u" + Size: 1 + EvalMode: interpolation + Active vector + 1 output field: + Output field 0: + Name: "v" + Size: 1 + EvalMode: interpolation + Active vector diff --git a/tests/t003-ceed-f.f90 b/tests/t003-ceed-f.f90 index 61c00b3535..00147b869d 100644 --- a/tests/t003-ceed-f.f90 +++ b/tests/t003-ceed-f.f90 @@ -12,6 +12,9 @@ program test call ceedview(ceed,err) + call ceedsetnumviewtabs(ceed,1,err) + call ceedview(ceed,err) + call ceeddestroy(ceed,err) end diff --git a/tests/t003-ceed.c b/tests/t003-ceed.c index 813c0cfe49..9e323c6d49 100644 --- a/tests/t003-ceed.c +++ b/tests/t003-ceed.c @@ -11,6 +11,18 @@ int main(int argc, char **argv) { CeedView(ceed, stdout); + CeedSetNumViewTabs(ceed, 1); + CeedView(ceed, stdout); + + // Check CeedObject interface + { + Ceed ceed_copy = NULL; + + CeedReferenceCopy(ceed, &ceed_copy); + CeedObjectView((CeedObject)ceed_copy, stdout); + CeedObjectDestroy((CeedObject *)&ceed_copy); + } + CeedDestroy(&ceed); return 0; } diff --git a/tests/t008-ceed.c b/tests/t008-ceed.c index 24b3fecff6..344b341ae7 100644 --- a/tests/t008-ceed.c +++ b/tests/t008-ceed.c @@ -11,7 +11,7 @@ int main(int argc, char **argv) { sprintf(help_resource, "help:%s", argv[1]); CeedInit(help_resource, &ceed); - CeedDestroy(&ceed); + CeedDestroy(&ceed); return 0; } diff --git a/tests/t010-config.c b/tests/t010-config.c new file mode 100644 index 0000000000..1becbc6ddd --- /dev/null +++ b/tests/t010-config.c @@ -0,0 +1,14 @@ +/// @file +/// Test git version and build configuration +/// \test Test git version and build configuration +#include +#include + +int main(int argc, char **argv) { + const char *git_version, *build_config; + CeedGetGitVersion(&git_version); + 
CeedGetBuildConfiguration(&build_config); + // printf("Git: %s\n", git_version); + // puts(build_config); + return 0; +} diff --git a/tests/t107-vector-f.f90 b/tests/t107-vector-f.f90 index 44531fe72b..51c2b79ff5 100644 --- a/tests/t107-vector-f.f90 +++ b/tests/t107-vector-f.f90 @@ -25,6 +25,9 @@ program test call ceedvectorview(x,err) + call ceedvectorsetnumviewtabs(x,1,err) + call ceedvectorview(x,err) + call ceedvectordestroy(x,err) call ceeddestroy(ceed,err) diff --git a/tests/t107-vector.c b/tests/t107-vector.c index ffa27a508d..b6f3eb38df 100644 --- a/tests/t107-vector.c +++ b/tests/t107-vector.c @@ -17,6 +17,16 @@ int main(int argc, char **argv) { CeedVectorView(x, "%12.8f", stdout); + // Check tabs and CeedObject functionality + { + CeedVector x_copy = NULL; + + CeedVectorReferenceCopy(x, &x_copy); + CeedVectorSetNumViewTabs(x_copy, 1); + CeedObjectView((CeedObject)x_copy, stdout); + CeedObjectDestroy((CeedObject *)&x_copy); + } + CeedVectorDestroy(&x); CeedDestroy(&ceed); return 0; diff --git a/tests/t127-vector.c b/tests/t127-vector.c new file mode 100644 index 0000000000..e13bf15d1b --- /dev/null +++ b/tests/t127-vector.c @@ -0,0 +1,60 @@ +/// @file +/// Test strided setting and copying of vectors +/// \test Test strided setting and copying of vectors +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + CeedSize start = 2, step = 3; + CeedVector x, y; + CeedInt len = 10; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, len, &x); + CeedVectorCreate(ceed, len, &y); + + // Set strided + CeedVectorSetValue(x, 1.0); + CeedVectorSetValueStrided(x, start, -1, step, 42.0); + { + const CeedScalar *read_array; + + CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array); + for (CeedInt i = 0; i < len; i++) { + CeedScalar value = (i - start) % step == 0 ? 
42.0 : 1.0; + + if (read_array[i] != value) { + // LCOV_EXCL_START + printf("Error in setting value in x at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, read_array[i], value); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(x, &read_array); + } + + // Copy strided + CeedVectorSetValue(y, 0.0); + CeedVectorCopyStrided(x, start, -1, step, y); + { + const CeedScalar *read_array; + + CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array); + for (CeedInt i = 0; i < len; i++) { + CeedScalar value = (i - start) % step == 0 ? 42.0 : 0.0; + + if (read_array[i] != value) { + // LCOV_EXCL_START + printf("Error in copying value to y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, read_array[i], value); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(y, &read_array); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&y); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t128-vector.c b/tests/t128-vector.c new file mode 100644 index 0000000000..037b482cbe --- /dev/null +++ b/tests/t128-vector.c @@ -0,0 +1,51 @@ +/// @file +/// Test copying into vector with borrowed pointer +/// \test Test copying into vector with borrowed pointer +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + CeedVector x, x_copy; + CeedInt len = 10; + CeedScalar array_borrowed[len]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, len, &x); + CeedVectorCreate(ceed, len, &x_copy); + + { + CeedScalar array[len]; + + for (CeedInt i = 0; i < len; i++) { + array[i] = i; + array_borrowed[i] = 10 + i; + } + + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array); + CeedVectorSetArray(x_copy, CEED_MEM_HOST, CEED_USE_POINTER, array_borrowed); + } + + // Copy to device if preferred + { + CeedMemType mem_type = CEED_MEM_HOST; + + CeedGetPreferredMemType(ceed, &mem_type); + if (mem_type == CEED_MEM_DEVICE) CeedVectorSyncArray(x, CEED_MEM_DEVICE); + } + + // Copy and sync borrowed array + CeedVectorCopy(x, x_copy); + 
CeedVectorSyncArray(x_copy, CEED_MEM_HOST); + + // Check that borrowed array is the same as the original input array a + for (CeedInt i = 0; i < len; i++) { + if (array_borrowed[i] != i) printf("Error in copying values of CeedVector\n"); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_copy); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t130-vector.c b/tests/t130-vector.c new file mode 100644 index 0000000000..d223a1ad06 --- /dev/null +++ b/tests/t130-vector.c @@ -0,0 +1,44 @@ +/// @file +/// Test getting and restoring work vectors +/// \test Test getting and restoring work vectors + +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + // Check for getting the same work vector back + { + CeedVector x, y; + + CeedGetWorkVector(ceed, 20, &x); + // Do not do this! + CeedVector x_copy = x; + + CeedRestoreWorkVector(ceed, &x); + CeedGetWorkVector(ceed, 20, &y); + if (y != x_copy) printf("failed to return same work vector"); + CeedRestoreWorkVector(ceed, &y); + } + + // Check for getting a new work vector back + { + CeedVector x, y; + + CeedGetWorkVector(ceed, 20, &x); + // Do not do this! 
+ CeedVector x_copy = x; + + CeedRestoreWorkVector(ceed, &x); + CeedGetWorkVector(ceed, 30, &y); + if (y == x_copy) printf("failed to return new work vector"); + CeedRestoreWorkVector(ceed, &y); + } + + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t131-vector.c b/tests/t131-vector.c new file mode 100644 index 0000000000..3fe78c6b94 --- /dev/null +++ b/tests/t131-vector.c @@ -0,0 +1,58 @@ +/// @file +/// Test clearing work vectors +/// \test Test clearing work vectors + +#include +#include +#include +#include + +static CeedScalar expected_usage(CeedSize length) { return length * sizeof(CeedScalar) * 1e-6; } + +int main(int argc, char **argv) { + Ceed ceed; + CeedVector x, y, z; + CeedScalar usage_mb; + + CeedInit(argv[1], &ceed); + + // Add work vectors of different lengths + CeedGetWorkVector(ceed, 10, &x); + CeedGetWorkVector(ceed, 20, &y); + CeedGetWorkVector(ceed, 30, &z); + + // Check memory usage, should be 60 * sizeof(CeedScalar) + CeedGetWorkVectorMemoryUsage(ceed, &usage_mb); + if (fabs(usage_mb - expected_usage(60)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(60)); + + // Restore x and z + CeedRestoreWorkVector(ceed, &x); + CeedRestoreWorkVector(ceed, &z); + + // Clear work vectors with length < 30. This should: + // - Remove x + // - Leave y, since it is still in use + // - Leave z, since it is length 30 + CeedClearWorkVectors(ceed, 30); + CeedGetWorkVectorMemoryUsage(ceed, &usage_mb); + if (fabs(usage_mb - expected_usage(50)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(50)); + + // Clear work vectors with length < 31. This should: + // - Leave y, since it is still in use + // - Remove z + CeedClearWorkVectors(ceed, 31); + CeedGetWorkVectorMemoryUsage(ceed, &usage_mb); + if (fabs(usage_mb - expected_usage(20)) > 100. 
* CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(20)); + + // Restore y + CeedRestoreWorkVector(ceed, &y); + + // Make sure we can still get back y without allocating a new work vector + CeedGetWorkVector(ceed, 20, &y); + CeedGetWorkVectorMemoryUsage(ceed, &usage_mb); + if (fabs(usage_mb - expected_usage(20)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(20)); + CeedRestoreWorkVector(ceed, &y); + + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t210-elemrestriction-f.f90 b/tests/t210-elemrestriction-f.f90 index b22c4fe5d8..7bad3f941a 100644 --- a/tests/t210-elemrestriction-f.f90 +++ b/tests/t210-elemrestriction-f.f90 @@ -27,6 +27,9 @@ program test call ceedelemrestrictionview(r,err) + call ceedelemrestrictionsetnumviewtabs(r,1,err) + call ceedelemrestrictionview(r,err) + call ceedelemrestrictiondestroy(r,err) call ceeddestroy(ceed,err) diff --git a/tests/t210-elemrestriction.c b/tests/t210-elemrestriction.c index 7aff301411..1cefd2d185 100644 --- a/tests/t210-elemrestriction.c +++ b/tests/t210-elemrestriction.c @@ -19,6 +19,16 @@ int main(int argc, char **argv) { CeedElemRestrictionView(elem_restriction, stdout); + // Check tabs and CeedObject functionality + { + CeedElemRestriction elem_restriction_copy = NULL; + + CeedElemRestrictionReferenceCopy(elem_restriction, &elem_restriction_copy); + CeedElemRestrictionSetNumViewTabs(elem_restriction_copy, 1); + CeedObjectView((CeedObject)elem_restriction_copy, stdout); + CeedObjectDestroy((CeedObject *)&elem_restriction_copy); + } + CeedElemRestrictionDestroy(&elem_restriction); CeedDestroy(&ceed); return 0; diff --git a/tests/t211-elemrestriction-f.f90 b/tests/t211-elemrestriction-f.f90 index 6d86c9c685..4cc27845fd 100644 --- a/tests/t211-elemrestriction-f.f90 +++ b/tests/t211-elemrestriction-f.f90 @@ -20,6 +20,9 @@ program test call ceedelemrestrictionview(r,err) + call ceedelemrestrictionsetnumviewtabs(r,1,err) + call 
ceedelemrestrictionview(r,err) + call ceedelemrestrictiondestroy(r,err) call ceeddestroy(ceed,err) diff --git a/tests/t211-elemrestriction.c b/tests/t211-elemrestriction.c index 55ba2de881..3318a56f18 100644 --- a/tests/t211-elemrestriction.c +++ b/tests/t211-elemrestriction.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { CeedInt strides[3] = {1, 2, 2}; CeedElemRestrictionCreateStrided(ceed, num_elem, 2, 1, num_elem * 2, strides, &elem_restriction); + CeedElemRestrictionView(elem_restriction, stdout); + CeedElemRestrictionSetNumViewTabs(elem_restriction, 1); CeedElemRestrictionView(elem_restriction, stdout); CeedElemRestrictionDestroy(&elem_restriction); diff --git a/tests/t212-elemrestriction-f.f90 b/tests/t212-elemrestriction-f.f90 index b36f7c2ea3..9d1341052a 100644 --- a/tests/t212-elemrestriction-f.f90 +++ b/tests/t212-elemrestriction-f.f90 @@ -21,6 +21,9 @@ program test call ceedelemrestrictionview(r,err) + call ceedelemrestrictionsetnumviewtabs(r,1,err) + call ceedelemrestrictionview(r,err) + call ceedelemrestrictiondestroy(r,err) call ceeddestroy(ceed,err) diff --git a/tests/t212-elemrestriction.c b/tests/t212-elemrestriction.c index 99f5dc1cea..3914727201 100644 --- a/tests/t212-elemrestriction.c +++ b/tests/t212-elemrestriction.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { CeedInt strides[3] = {1, 2, 2}; CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, 2, 2, 1, num_elem * 2, strides, &elem_restriction); + CeedElemRestrictionView(elem_restriction, stdout); + CeedElemRestrictionSetNumViewTabs(elem_restriction, 1); CeedElemRestrictionView(elem_restriction, stdout); CeedElemRestrictionDestroy(&elem_restriction); diff --git a/tests/t217-elemrestriction.c b/tests/t217-elemrestriction.c index ca4f62a048..b9c52c52eb 100644 --- a/tests/t217-elemrestriction.c +++ b/tests/t217-elemrestriction.c @@ -55,10 +55,11 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array); for (CeedInt i = 0; i < num_elem + 1; i++) { - 
if (x_array[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0)) + if (x_array[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0)) { // LCOV_EXCL_START printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", i, (CeedScalar)x_array[i]); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(x, &x_array); } diff --git a/tests/t231-elemrestriction.c b/tests/t231-elemrestriction.c index de6cd2466e..21077001aa 100644 --- a/tests/t231-elemrestriction.c +++ b/tests/t231-elemrestriction.c @@ -2,6 +2,7 @@ /// Test creation, use, and destruction of an element restriction at points /// \test Test creation, use, and destruction of an element restriction at points #include +#include #include int main(int argc, char **argv) { @@ -13,57 +14,61 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_points, &x); { - CeedInt point_index = num_elem; - CeedScalar array[num_points]; + CeedInt offset = num_elem + 1; + CeedInt point_index = num_elem; for (CeedInt i = 0; i < num_elem; i++) { CeedInt num_points_in_elem = (i + 1) % num_elem + 1; + ind[i] = offset; for (CeedInt j = 0; j < num_points_in_elem; j++) { - array[point_index] = i; - point_index = (point_index + 1) % num_points; + ind[offset + j] = point_index; + point_index = (point_index + 1) % num_points; } + offset += num_points_in_elem; } - CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array); + ind[num_elem] = offset; } - CeedVectorCreate(ceed, num_points, &y); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction); + CeedElemRestrictionCreateVector(elem_restriction, &x, &y); + CeedVectorSetValue(y, 0.0); { - CeedInt offset = num_elem + 1; - CeedInt point_index = num_elem; + CeedInt point_index = num_elem; + CeedScalar array[num_points]; for (CeedInt i = 0; i < num_elem; i++) { CeedInt num_points_in_elem = (i + 1) % num_elem + 1; - ind[i] = offset; for (CeedInt j = 0; j < 
num_points_in_elem; j++) { - ind[offset + j] = point_index; - point_index = (point_index + 1) % num_points; + array[point_index] = i; + point_index = (point_index + 1) % num_points; } - offset += num_points_in_elem; } - ind[num_elem] = offset; + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array); } - CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction); + CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE); { - CeedInt index = 0; + CeedInt e_layout[3]; const CeedScalar *read_array; CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array); + CeedElemRestrictionGetELayout(elem_restriction, e_layout); for (CeedInt i = 0; i < num_elem; i++) { - CeedInt num_points_in_elem = (i + 1) % num_elem + 1; + CeedSize elem_offset = 0; + const CeedInt num_points_in_elem = (i + 1) % num_elem + 1; + CeedElemRestrictionGetAtPointsElementOffset(elem_restriction, i, &elem_offset); for (CeedInt j = 0; j < num_points_in_elem; j++) { - if (i != read_array[index]) { + if (i != read_array[elem_offset + j * e_layout[0]]) { // LCOV_EXCL_START - printf("Error in restricted array y[%" CeedInt_FMT "] = %f\n", index, (CeedScalar)read_array[i]); + printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", (CeedInt)elem_offset + j * e_layout[0], + (CeedScalar)read_array[elem_offset + j * e_layout[0]], (CeedScalar)i); // LCOV_EXCL_STOP } - index++; } } CeedVectorRestoreArrayRead(y, &read_array); diff --git a/tests/t232-elemrestriction.c b/tests/t232-elemrestriction.c index 66557dba28..7632875fcf 100644 --- a/tests/t232-elemrestriction.c +++ b/tests/t232-elemrestriction.c @@ -1,7 +1,8 @@ /// @file -/// Test creation, use, and destruction of an element restriction at points for single elements -/// \test Test creation, use, and destruction of an element restriction at points for single elements +/// Test creation, use, and destruction of an element restriction at 
points +/// \test Test creation, use, and destruction of an element restriction at points #include +#include #include int main(int argc, char **argv) { @@ -13,22 +14,6 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_points, &x); - { - CeedInt point_index = num_elem; - CeedScalar array[num_points]; - - for (CeedInt i = 0; i < num_elem; i++) { - CeedInt num_points_in_elem = (i + 1) % num_elem + 1; - - for (CeedInt j = 0; j < num_points_in_elem; j++) { - array[point_index] = i; - point_index = (point_index + 1) % num_points; - } - } - CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array); - } - { CeedInt offset = num_elem + 1; CeedInt point_index = num_elem; @@ -47,30 +32,43 @@ int main(int argc, char **argv) { } CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction); + CeedElemRestrictionCreateVector(elem_restriction, &x, &y); + CeedVectorSetValue(y, 0.0); { - CeedInt max_points; + CeedInt point_index = num_elem; + CeedScalar array[num_points]; - CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points); - CeedVectorCreate(ceed, max_points, &y); + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points_in_elem = (i + 1) % num_elem + 1; + + for (CeedInt j = 0; j < num_points_in_elem; j++) { + array[point_index] = i; + point_index = (point_index + 1) % num_points; + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array); } + CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE); + CeedElemRestrictionApply(elem_restriction, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE); { - for (CeedInt i = 0; i < num_elem; i++) { - CeedInt num_points_in_elem = (i + 1) % num_elem + 1; - const CeedScalar *read_array; + CeedInt point_index = num_elem; + const CeedScalar *read_array; - CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_NOTRANSPOSE, x, y, 
CEED_REQUEST_IMMEDIATE); - CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array); + CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array); + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points_in_elem = (i + 1) % num_elem + 1; for (CeedInt j = 0; j < num_points_in_elem; j++) { - if (i != read_array[j]) { + if (read_array[point_index] != 2 * i) { // LCOV_EXCL_START - printf("Error in restricted element array %" CeedInt_FMT " y[%" CeedInt_FMT "] = %f\n", i, j, (CeedScalar)read_array[j]); + printf("Error in restricted array x[%" CeedInt_FMT "] = %f != %f\n", point_index, read_array[point_index], 2.0 * i); // LCOV_EXCL_STOP } + point_index = (point_index + 1) % num_points; } - CeedVectorRestoreArrayRead(y, &read_array); } + CeedVectorRestoreArrayRead(x, &read_array); } CeedVectorDestroy(&x); diff --git a/tests/t233-elemrestriction.c b/tests/t233-elemrestriction.c index 1ad395b4d1..3573e1c349 100644 --- a/tests/t233-elemrestriction.c +++ b/tests/t233-elemrestriction.c @@ -1,8 +1,7 @@ /// @file -/// Test creation, transpose use, and destruction of an element restriction at points for single elements -/// \test Test creation, transpose use, and destruction of an element restriction at points for single elements +/// Test creation, use, and destruction of an element restriction at points for single elements +/// \test Test creation, use, and destruction of an element restriction at points for single elements #include -#include #include int main(int argc, char **argv) { @@ -14,9 +13,6 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_points, &x); - CeedVectorSetValue(x, 0.0); - { CeedInt offset = num_elem + 1; CeedInt point_index = num_elem; @@ -33,38 +29,53 @@ int main(int argc, char **argv) { } ind[num_elem] = offset; } - CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, 
num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction); + + CeedElemRestrictionCreateVector(elem_restriction, &x, NULL); + { + CeedInt point_index = num_elem; + CeedScalar array[num_points]; + + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points_in_elem = (i + 1) % num_elem + 1; + + for (CeedInt j = 0; j < num_points_in_elem; j++) { + array[point_index] = i; + point_index = (point_index + 1) % num_points; + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array); + } { - CeedInt max_points; + CeedInt min_points, max_points; + CeedElemRestrictionGetMinPointsInElement(elem_restriction, &min_points); CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points); + if (min_points != 1 || max_points != num_elem) { + // LCOV_EXCL_START + printf("Error in min/max points: min %" CeedInt_FMT " max %" CeedInt_FMT "\n", min_points, max_points); + // LCOV_EXCL_STOP + } CeedVectorCreate(ceed, max_points, &y); - CeedVectorSetValue(y, 1.0); } { for (CeedInt i = 0; i < num_elem; i++) { - CeedInt point_index = num_elem; + CeedInt num_points_in_elem = (i + 1) % num_elem + 1; const CeedScalar *read_array; - CeedVectorSetValue(x, 0.0); - CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE); - - CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array); - for (CeedInt j = 0; j < num_elem; j++) { - CeedInt num_points_in_elem = (j + 1) % num_elem + 1; + CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE); + CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array); - for (CeedInt k = 0; k < num_points_in_elem; k++) { - if (fabs(read_array[point_index] - (i == j ? 
1.0 : 0.0)) > 10 * CEED_EPSILON) { - // LCOV_EXCL_START - printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", point_index, (CeedScalar)read_array[point_index]); - // LCOV_EXCL_STOP - } - point_index = (point_index + 1) % num_points; + for (CeedInt j = 0; j < num_points_in_elem; j++) { + if (i != read_array[j]) { + // LCOV_EXCL_START + printf("Error in restricted element array %" CeedInt_FMT " y[%" CeedInt_FMT "] = %f\n", i, j, (CeedScalar)read_array[j]); + // LCOV_EXCL_STOP } } - CeedVectorRestoreArrayRead(x, &read_array); + CeedVectorRestoreArrayRead(y, &read_array); } } diff --git a/tests/t234-elemrestriction.c b/tests/t234-elemrestriction.c new file mode 100644 index 0000000000..3f434bd365 --- /dev/null +++ b/tests/t234-elemrestriction.c @@ -0,0 +1,81 @@ +/// @file +/// Test creation, transpose use, and destruction of an element restriction at points for single elements +/// \test Test creation, transpose use, and destruction of an element restriction at points for single elements +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem = 3, num_points = num_elem * 2; + CeedInt ind[(num_elem + 1) + num_points]; + CeedVector x, y; + CeedElemRestriction elem_restriction; + + CeedInit(argv[1], &ceed); + + { + CeedInt offset = num_elem + 1; + CeedInt point_index = num_elem; + + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt num_points_in_elem = (i + 1) % num_elem + 1; + + ind[i] = offset; + for (CeedInt j = 0; j < num_points_in_elem; j++) { + ind[offset + j] = point_index; + point_index = (point_index + 1) % num_points; + } + offset += num_points_in_elem; + } + ind[num_elem] = offset; + } + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction); + + CeedElemRestrictionCreateVector(elem_restriction, &x, NULL); + CeedVectorSetValue(x, 0.0); + { + CeedInt min_points, max_points; + + 
CeedElemRestrictionGetMinPointsInElement(elem_restriction, &min_points); + CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points); + if (min_points != 1 || max_points != num_elem) { + // LCOV_EXCL_START + printf("Error in min/max points: min %" CeedInt_FMT " max %" CeedInt_FMT "\n", min_points, max_points); + // LCOV_EXCL_STOP + } + CeedVectorCreate(ceed, max_points, &y); + CeedVectorSetValue(y, 1.0); + } + + { + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt point_index = num_elem; + const CeedScalar *read_array; + + CeedVectorSetValue(x, 0.0); + CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array); + for (CeedInt j = 0; j < num_elem; j++) { + CeedInt num_points_in_elem = (j + 1) % num_elem + 1; + + for (CeedInt k = 0; k < num_points_in_elem; k++) { + if (fabs(read_array[point_index] - (i == j ? 1.0 : 0.0)) > 10 * CEED_EPSILON) { + // LCOV_EXCL_START + printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", point_index, (CeedScalar)read_array[point_index]); + // LCOV_EXCL_STOP + } + point_index = (point_index + 1) % num_points; + } + } + CeedVectorRestoreArrayRead(x, &read_array); + } + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&y); + CeedElemRestrictionDestroy(&elem_restriction); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t300-basis-f.f90 b/tests/t300-basis-f.f90 index 1397a5a403..6aef5a0c34 100644 --- a/tests/t300-basis-f.f90 +++ b/tests/t300-basis-f.f90 @@ -18,7 +18,10 @@ program test call ceedbasiscreatetensorh1lagrange(ceed,1,1,4,4,ceed_gauss,b,err) call ceedbasisview(b,err) + call ceedbasissetnumviewtabs(b,1,err) + call ceedbasisview(b,err) call ceedbasisdestroy(b,err) + call ceeddestroy(ceed,err) end diff --git a/tests/t300-basis.c b/tests/t300-basis.c index db17332def..d340be94e3 100644 --- a/tests/t300-basis.c +++ b/tests/t300-basis.c @@ -18,8 +18,18 @@ int main(int argc, char **argv) { 
CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS, &basis); CeedBasisView(basis, stdout); - CeedBasisDestroy(&basis); + // Check tabs and CeedObject functionality + { + CeedBasis basis_copy = NULL; + + CeedBasisReferenceCopy(basis, &basis_copy); + CeedBasisSetNumViewTabs(basis_copy, 1); + CeedObjectView((CeedObject)basis_copy, stdout); + CeedObjectDestroy((CeedObject *)&basis_copy); + } + + CeedBasisDestroy(&basis); CeedDestroy(&ceed); return 0; } diff --git a/tests/t302-basis.c b/tests/t302-basis.c index 72623f400f..bfe94c8ef5 100644 --- a/tests/t302-basis.c +++ b/tests/t302-basis.c @@ -26,7 +26,7 @@ int main(int argc, char **argv) { if (fabs(collocated_gradient_1d[j + p * i] - gradient_1d[j + p * i]) > 100 * CEED_EPSILON) { // LCOV_EXCL_START printf("Error in collocated gradient %f != %f\n", collocated_gradient_1d[j + p * i], gradient_1d[j + p * i]); - // LCOV_EXCL_START + // LCOV_EXCL_STOP } } } diff --git a/tests/t303-basis.c b/tests/t303-basis.c index d71c97a6e7..baf844bf50 100644 --- a/tests/t303-basis.c +++ b/tests/t303-basis.c @@ -1,6 +1,6 @@ /// @file -/// Test checking BasisApply input/output vectors compatibility with basis dimensions -/// \test Test checking BasisApply input/output vectors compatibility with basis dimensions +/// Test checking BasisApply input/output vectors compatibility with basis +/// \test Test checking BasisApply input/output vectors compatibility with basis //TESTARGS(only="cpu") {ceed_resource} #include @@ -15,7 +15,7 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); CeedVectorCreate(ceed, len, &u); - CeedVectorCreate(ceed, len + 1, &v); + CeedVectorCreate(ceed, len - 1, &v); CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis); diff --git a/tests/t319-basis.c b/tests/t319-basis.c index 8e542ad1c6..c314cb2e82 100644 --- a/tests/t319-basis.c +++ b/tests/t319-basis.c @@ -1,6 +1,7 @@ /// @file /// Test projection interp and grad in multiple dimensions /// \test Test projection interp 
and grad in multiple dimensions +#include "t319-basis.h" #include #include #include @@ -34,6 +35,79 @@ static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { return tol; } +static void VerifyProjectedBasis(CeedBasis basis_project, CeedInt dim, CeedInt p_to_dim, CeedInt p_from_dim, CeedVector x_to, CeedVector x_from, + CeedVector u_to, CeedVector u_from, CeedVector du_to) { + CeedScalar tol; + + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + tol = GetTolerance(scalar_type, dim); + } + + // Setup coarse solution + { + const CeedScalar *x_array; + CeedScalar u_array[p_from_dim]; + + CeedVectorGetArrayRead(x_from, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p_from_dim; i++) { + CeedScalar coord[dim]; + for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_from_dim * d + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_from, &x_array); + CeedVectorSetArray(u_from, CEED_MEM_HOST, CEED_COPY_VALUES, u_array); + } + + // Project to fine basis + CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u_from, u_to); + + // Check solution + { + const CeedScalar *x_array, *u_array; + + CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array); + CeedVectorGetArrayRead(u_to, CEED_MEM_HOST, &u_array); + for (CeedInt i = 0; i < p_to_dim; i++) { + CeedScalar coord[dim]; + for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_to_dim + i]; + const CeedScalar u = Eval(dim, coord); + if (fabs(u - u_array[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_array[i], u); + } + CeedVectorRestoreArrayRead(x_to, &x_array); + CeedVectorRestoreArrayRead(u_to, &u_array); + } + + // Project and take gradient + CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u_from, du_to); + + // Check solution + { + const CeedScalar *x_array, *du_array; + + CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array); + CeedVectorGetArrayRead(du_to, CEED_MEM_HOST, &du_array); + for 
(CeedInt i = 0; i < p_to_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_to_dim * d + i]; + for (CeedInt d = 0; d < dim; d++) { + const CeedScalar du = EvalGrad(d, coord); + + if (fabs(du - du_array[p_to_dim * d + i]) > tol) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_array[p_to_dim * d + i], du); + // LCOV_EXCL_STOP + } + } + } + CeedVectorRestoreArrayRead(x_to, &x_array); + CeedVectorRestoreArrayRead(du_to, &du_array); + } +} + int main(int argc, char **argv) { Ceed ceed; @@ -42,15 +116,8 @@ int main(int argc, char **argv) { for (CeedInt dim = 1; dim <= 3; dim++) { CeedVector x_corners, x_from, x_to, u_from, u_to, du_to; CeedBasis basis_x, basis_from, basis_to, basis_project; - CeedInt p_from = 5, p_to = 6, q = 7, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim); - CeedScalar tol; + CeedInt p_from = 4, p_to = 5, q = 6, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim); - { - CeedScalarType scalar_type; - - CeedGetScalarType(&scalar_type); - tol = GetTolerance(scalar_type, dim); - } CeedVectorCreate(ceed, x_dim * dim, &x_corners); { CeedScalar x_array[x_dim * dim]; @@ -82,66 +149,46 @@ int main(int argc, char **argv) { CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_to, q, CEED_GAUSS, &basis_to); CeedBasisCreateProjection(basis_from, basis_to, &basis_project); - // Setup coarse solution - { - const CeedScalar *x_array; - CeedScalar u_array[p_from_dim]; - - CeedVectorGetArrayRead(x_from, CEED_MEM_HOST, &x_array); - for (CeedInt i = 0; i < p_from_dim; i++) { - CeedScalar coord[dim]; - for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_from_dim * d + i]; - u_array[i] = Eval(dim, coord); - } - CeedVectorRestoreArrayRead(x_from, &x_array); - CeedVectorSetArray(u_from, CEED_MEM_HOST, CEED_COPY_VALUES, u_array); - } - - // Project to fine
basis - CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u_from, u_to); + VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to); - // Check solution + // Create non-tensor bases + CeedBasis basis_from_nontensor, basis_to_nontensor; { - const CeedScalar *x_array, *u_array; - - CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array); - CeedVectorGetArrayRead(u_to, CEED_MEM_HOST, &u_array); - for (CeedInt i = 0; i < p_to_dim; i++) { - CeedScalar coord[dim]; - for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_to_dim + i]; - const CeedScalar u = Eval(dim, coord); - if (fabs(u - u_array[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_array[i], u); - } - CeedVectorRestoreArrayRead(x_to, &x_array); - CeedVectorRestoreArrayRead(u_to, &u_array); + CeedElemTopology topo; + CeedInt num_comp, num_nodes, num_qpts; + const CeedScalar *interp, *grad; + + CeedBasisGetTopology(basis_from, &topo); + CeedBasisGetNumComponents(basis_from, &num_comp); + CeedBasisGetNumNodes(basis_from, &num_nodes); + CeedBasisGetNumQuadraturePoints(basis_from, &num_qpts); + CeedBasisGetInterp(basis_from, &interp); + CeedBasisGetGrad(basis_from, &grad); + CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, num_qpts, interp, grad, NULL, NULL, &basis_from_nontensor); + + CeedBasisGetTopology(basis_to, &topo); + CeedBasisGetNumComponents(basis_to, &num_comp); + CeedBasisGetNumNodes(basis_to, &num_nodes); + CeedBasisGetNumQuadraturePoints(basis_to, &num_qpts); + CeedBasisGetInterp(basis_to, &interp); + CeedBasisGetGrad(basis_to, &grad); + CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, num_qpts, interp, grad, NULL, NULL, &basis_to_nontensor); } - // Project and take gradient - CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u_from, du_to); + // Test projection on non-tensor bases + CeedBasisDestroy(&basis_project); + CeedBasisCreateProjection(basis_from_nontensor, basis_to_nontensor, 
&basis_project); + VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to); - // Check solution - { - const CeedScalar *x_array, *du_array; - - CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array); - CeedVectorGetArrayRead(du_to, CEED_MEM_HOST, &du_array); - for (CeedInt i = 0; i < p_to_dim; i++) { - CeedScalar coord[dim]; - - for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_to_dim * d + i]; - for (CeedInt d = 0; d < dim; d++) { - const CeedScalar du = EvalGrad(d, coord); - - if (fabs(du - du_array[p_to_dim * d + i]) > tol) { - // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_array[p_to_dim * (dim - 1 - d) + i], du); - // LCOV_EXCL_STOP - } - } - } - CeedVectorRestoreArrayRead(x_to, &x_array); - CeedVectorRestoreArrayRead(du_to, &du_array); - } + // Test projection from non-tensor to tensor + CeedBasisDestroy(&basis_project); + CeedBasisCreateProjection(basis_from_nontensor, basis_to, &basis_project); + VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to); + + // Test projection from tensor to non-tensor + CeedBasisDestroy(&basis_project); + CeedBasisCreateProjection(basis_from, basis_to_nontensor, &basis_project); + VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to); CeedVectorDestroy(&x_corners); CeedVectorDestroy(&x_from); @@ -150,9 +197,52 @@ int main(int argc, char **argv) { CeedVectorDestroy(&u_to); CeedVectorDestroy(&du_to); CeedBasisDestroy(&basis_from); + CeedBasisDestroy(&basis_from_nontensor); CeedBasisDestroy(&basis_to); + CeedBasisDestroy(&basis_to_nontensor); CeedBasisDestroy(&basis_project); } + + // Test projection between basis of different topological dimension + { + CeedInt face_dim = 2, P_1D = 2; + CeedBasis basis_face, basis_cell_to_face, basis_proj; + + CeedScalar *q_ref = NULL, *q_weights = NULL; + const CeedScalar *grad, *interp; + CeedInt P, 
Q; + GetCellToFaceTabulation(CEED_GAUSS, &P, &Q, &interp, &grad); + + CeedBasisCreateTensorH1Lagrange(ceed, face_dim, 1, 2, P_1D, CEED_GAUSS, &basis_face); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_HEX, 1, P, Q, (CeedScalar *)interp, (CeedScalar *)grad, q_ref, q_weights, &basis_cell_to_face); + CeedBasisCreateProjection(basis_cell_to_face, basis_face, &basis_proj); + const CeedScalar *interp_proj, *grad_proj, *interp_proj_ref, *grad_proj_ref; + + GetCellToFaceTabulation(CEED_GAUSS_LOBATTO, NULL, NULL, &interp_proj_ref, &grad_proj_ref); + CeedBasisGetInterp(basis_proj, &interp_proj); + CeedBasisGetGrad(basis_proj, &grad_proj); + CeedScalar tol = 100 * CEED_EPSILON; + + for (CeedInt i = 0; i < 4 * 8; i++) { + if (fabs(interp_proj[i] - ((CeedScalar *)interp_proj_ref)[i]) > tol) { + // LCOV_EXCL_START + printf("Mixed Topology Projection: interp[%" CeedInt_FMT "] expected %f, got %f\n", i, ((CeedScalar *)interp_proj_ref)[i], interp_proj[i]); + // LCOV_EXCL_STOP + } + } + + for (CeedInt i = 0; i < 3 * 4 * 8; i++) { + if (fabs(grad_proj[i] - ((CeedScalar *)grad_proj_ref)[i]) > tol) { + // LCOV_EXCL_START + printf("Mixed Topology Projection: grad[%" CeedInt_FMT "] expected %f, got %f\n", i, ((CeedScalar *)grad_proj_ref)[i], grad_proj[i]); + // LCOV_EXCL_STOP + } + } + + CeedBasisDestroy(&basis_face); + CeedBasisDestroy(&basis_cell_to_face); + CeedBasisDestroy(&basis_proj); + } CeedDestroy(&ceed); return 0; } diff --git a/tests/t319-basis.h b/tests/t319-basis.h new file mode 100644 index 0000000000..965a7fcd0c --- /dev/null +++ b/tests/t319-basis.h @@ -0,0 +1,72 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +// Interpolation matrices for cell-to-face of Q1 hexahedral element onto its "5" face (in PETSc) +// Nodes are at Gauss-Lobatto points and quadrature points are Gauss, all over [-1,1] domain range +const CeedScalar Q1_interp_gauss[4][8] = { + {0.62200846792814612, 0, 0.16666666666666669, 0, 0.16666666666666669, 0, 0.044658198738520463, 0}, + {0.16666666666666669, 0, 0.62200846792814612, 0, 0.044658198738520463, 0, 0.16666666666666669, 0}, + {0.16666666666666669, 0, 0.044658198738520463, 0, 0.62200846792814612, 0, 0.16666666666666669, 0}, + {0.044658198738520463, 0, 0.16666666666666669, 0, 0.16666666666666669, 0, 0.62200846792814612, 0} +}; +const CeedScalar Q1_grad_gauss[3][4][8] = { + {{-0.31100423396407312, 0.31100423396407312, -0.083333333333333343, 0.083333333333333343, -0.083333333333333343, 0.083333333333333343, + -0.022329099369260232, 0.022329099369260232}, + {-0.083333333333333343, 0.083333333333333343, -0.31100423396407312, 0.31100423396407312, -0.022329099369260232, 0.022329099369260232, + -0.083333333333333343, 0.083333333333333343}, + {-0.083333333333333343, 0.083333333333333343, -0.022329099369260232, 0.022329099369260232, -0.31100423396407312, 0.31100423396407312, + -0.083333333333333343, 0.083333333333333343}, + {-0.022329099369260232, 0.022329099369260232, -0.083333333333333343, 0.083333333333333343, -0.083333333333333343, 0.083333333333333343, + -0.31100423396407312, 0.31100423396407312} }, + {{-0.39433756729740643, 0, 0.39433756729740643, 0, -0.10566243270259357, 0, 0.10566243270259357, 0}, + {-0.39433756729740643, 0, 0.39433756729740643, 0, -0.10566243270259357, 0, 0.10566243270259357, 0}, + {-0.10566243270259357, 0, 0.10566243270259357, 0, -0.39433756729740643, 0, 0.39433756729740643, 0}, + {-0.10566243270259357, 0, 0.10566243270259357, 0, -0.39433756729740643, 0, 0.39433756729740643, 0}}, + {{-0.39433756729740643, 0,
-0.10566243270259357, 0, 0.39433756729740643, 0, 0.10566243270259357, 0}, + {-0.10566243270259357, 0, -0.39433756729740643, 0, 0.10566243270259357, 0, 0.39433756729740643, 0}, + {-0.39433756729740643, 0, -0.10566243270259357, 0, 0.39433756729740643, 0, 0.10566243270259357, 0}, + {-0.10566243270259357, 0, -0.39433756729740643, 0, 0.10566243270259357, 0, 0.39433756729740643, 0}} +}; + +const CeedScalar Q1_interp_gauss_lobatto[4][8] = { + {1, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 1, 0} +}; +/* clang-format off */ +const CeedScalar Q1_grad_gauss_lobatto[3][4][8] = { + {{-0.5, 0.5, 0, 0, 0, 0, 0, 0}, + {0, 0, -0.5, 0.5, 0, 0, 0, 0}, + {0, 0, 0, 0, -0.5, 0.5, 0, 0}, + {0, 0, 0, 0, 0, 0, -0.5, 0.5}}, + {{-0.5, 0, 0.5, 0, 0, 0, 0, 0}, + {-0.5, 0, 0.5, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, -0.5, 0, 0.5, 0}, + {0, 0, 0, 0, -0.5, 0, 0.5, 0}}, + {{-0.5, 0, 0, 0, 0.5, 0, 0, 0}, + {0, 0, -0.5, 0, 0, 0, 0.5, 0}, + {-0.5, 0, 0, 0, 0.5, 0, 0, 0}, + {0, 0, -0.5, 0, 0, 0, 0.5, 0}} +}; +/* clang-format on */ + +static void GetCellToFaceTabulation(CeedQuadMode quad_mode, CeedInt *P, CeedInt *Q, const CeedScalar **interp, const CeedScalar **grad) { + if (P) *P = 8; + if (Q) *Q = 4; + + if (quad_mode == CEED_GAUSS) { + *interp = (const CeedScalar *)Q1_interp_gauss; + *grad = (const CeedScalar *)Q1_grad_gauss; + } + if (quad_mode == CEED_GAUSS_LOBATTO) { + *interp = (const CeedScalar *)Q1_interp_gauss_lobatto; + *grad = (const CeedScalar *)Q1_grad_gauss_lobatto; + } +} diff --git a/tests/t320-basis-f.f90 b/tests/t320-basis-f.f90 index 46dffdede5..c8eb67fee8 100644 --- a/tests/t320-basis-f.f90 +++ b/tests/t320-basis-f.f90 @@ -32,6 +32,8 @@ program test call ceedbasiscreateh1(ceed,ceed_triangle,1,p,q,interp,grad,qref,qweight,& & b,err) call ceedbasisview(b,err) + call ceedbasissetnumviewtabs(b,1,err) + call ceedbasisview(b,err) call ceedbasisdestroy(b,err) call ceeddestroy(ceed,err) diff --git a/tests/t320-basis-f.h 
b/tests/t320-basis-f.h index 93129e54de..84e7486a10 100644 --- a/tests/t320-basis-f.h +++ b/tests/t320-basis-f.h @@ -1,4 +1,4 @@ -! Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details. ! ! SPDX-License-Identifier: BSD-2-Clause diff --git a/tests/t320-basis.c b/tests/t320-basis.c index c028fcd0a5..20309ec1ed 100644 --- a/tests/t320-basis.c +++ b/tests/t320-basis.c @@ -20,6 +20,8 @@ int main(int argc, char **argv) { Build2DSimplex(q_ref, q_weight, interp, grad); CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis); CeedBasisView(basis, stdout); + CeedBasisSetNumViewTabs(basis, 1); + CeedBasisView(basis, stdout); CeedBasisDestroy(&basis); CeedDestroy(&ceed); diff --git a/tests/t320-basis.h b/tests/t320-basis.h index ef38e43b0a..30f8e824d0 100644 --- a/tests/t320-basis.h +++ b/tests/t320-basis.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/tests/t330-basis.c b/tests/t330-basis.c index dfbf3373ff..bd96afd40d 100644 --- a/tests/t330-basis.c +++ b/tests/t330-basis.c @@ -21,6 +21,8 @@ int main(int argc, char **argv) { BuildHdivQuadrilateral(q, q_ref, q_weights, interp, div, CEED_GAUSS); CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, 1, p, num_qpts, interp, div, q_ref, q_weights, &basis); CeedBasisView(basis, stdout); + CeedBasisSetNumViewTabs(basis, 1); + CeedBasisView(basis, stdout); CeedBasisDestroy(&basis); CeedDestroy(&ceed); diff --git a/tests/t330-basis.h b/tests/t330-basis.h index 82ae5a3d81..b75bd421b9 100644 --- a/tests/t330-basis.h +++ b/tests/t330-basis.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/tests/t340-basis.c b/tests/t340-basis.c index e9af85ff5f..8a70269bec 100644 --- a/tests/t340-basis.c +++ b/tests/t340-basis.c @@ -20,6 +20,8 @@ int main(int argc, char **argv) { BuildHcurl2DSimplex(q_ref, q_weight, interp, curl); CeedBasisCreateHcurl(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, curl, q_ref, q_weight, &basis); CeedBasisView(basis, stdout); + CeedBasisSetNumViewTabs(basis, 1); + CeedBasisView(basis, stdout); CeedBasisDestroy(&basis); CeedDestroy(&ceed); diff --git a/tests/t340-basis.h b/tests/t340-basis.h index 5fd8c420bc..90aef60f15 100644 --- a/tests/t340-basis.h +++ b/tests/t340-basis.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause diff --git a/tests/t350-basis.c b/tests/t350-basis.c index 54979bb9a0..becc0d98ea 100644 --- a/tests/t350-basis.c +++ b/tests/t350-basis.c @@ -56,7 +56,7 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); { const CeedScalar *x_array, *v_array; diff --git a/tests/t351-basis.c b/tests/t351-basis.c index 14b23730e1..84f59cc838 100644 --- a/tests/t351-basis.c +++ b/tests/t351-basis.c @@ -65,7 +65,7 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); { const CeedScalar *x_array, *v_array; diff --git a/tests/t352-basis.c b/tests/t352-basis.c index a4bf13d8b6..c2da0e2dd4 100644 --- a/tests/t352-basis.c +++ b/tests/t352-basis.c @@ -65,7 +65,7 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); { const CeedScalar *x_array, *v_array; diff --git a/tests/t353-basis.c b/tests/t353-basis.c index 22f80ddcdd..83fd16adb0 100644 --- a/tests/t353-basis.c +++ b/tests/t353-basis.c @@ -60,17 +60,18 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, 
CEED_EVAL_INTERP, x_points, u, v); for (CeedInt i = 0; i < num_points; i++) { - CeedScalar fx = 0.0; + const CeedInt num_point[1] = {1}; + CeedScalar fx = 0.0; const CeedScalar *x_array, *u_array, *v_array, *u_point_array; CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array); CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); CeedVectorSetValue(x_point, x_array[i]); - CeedBasisApplyAtPoints(basis_u, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); + CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array); for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j]; if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i], fx, x_array[i]); diff --git a/tests/t354-basis.c b/tests/t354-basis.c index 85f0ac2293..4d3402b257 100644 --- a/tests/t354-basis.c +++ b/tests/t354-basis.c @@ -1,6 +1,6 @@ /// @file -/// Test polynomial interpolation to arbitrary points in multiple dimensions -/// \test Test polynomial interpolation to arbitrary points in multiple dimensions +/// Test polynomial interpolation transpose to arbitrary points in multiple dimensions +/// \test Test polynomial interpolation transpose to arbitrary points in multiple dimensions #include #include #include @@ -69,10 +69,11 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); for (CeedInt i = 0; i < num_points; i++) { - CeedScalar fx = 0.0; + const CeedInt num_point[1] = {1}; + CeedScalar fx = 0.0; CeedScalar coord[dim]; const CeedScalar *x_array, *u_array, *v_array, *u_point_array; @@ -81,10 +82,10 @@ int main(int argc, char 
**argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * num_points + i]; CeedVectorSetArray(x_point, CEED_MEM_HOST, CEED_COPY_VALUES, coord); - CeedBasisApplyAtPoints(basis_u, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); + CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array); for (CeedInt j = 0; j < p_dim; j++) fx += u_array[j] * u_point_array[j]; - if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) { + if (fabs(v_array[i] - fx) > 500. * CEED_EPSILON) { // LCOV_EXCL_START printf("[%" CeedInt_FMT "] %f != %f = f(%f", dim, v_array[i], fx, coord[0]); for (CeedInt d = 1; d < dim; d++) printf(", %f", coord[d]); diff --git a/tests/t355-basis.c b/tests/t355-basis.c index 7fd7906dcb..5b93764a7a 100644 --- a/tests/t355-basis.c +++ b/tests/t355-basis.c @@ -62,7 +62,7 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v); { const CeedScalar *x_array, *v_array; diff --git a/tests/t356-basis.c b/tests/t356-basis.c index 8eb3c57e7c..263cc43b66 100644 --- a/tests/t356-basis.c +++ b/tests/t356-basis.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) { CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v); { const CeedScalar *x_array, *v_array; diff --git a/tests/t357-basis.c b/tests/t357-basis.c index ecfa56476c..0f4e105a66 100644 --- a/tests/t357-basis.c +++ b/tests/t357-basis.c @@ -82,8 +82,8 @@ int main(int argc, char 
**argv) { } // Calculate G u at arbitrary points, G' * 1 at dofs - CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points); - CeedBasisApplyAtPoints(basis_u, num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); { const CeedScalar *u_array, *v_array, *u_points_array; diff --git a/tests/t360-basis.c b/tests/t360-basis.c new file mode 100644 index 0000000000..f953157e1c --- /dev/null +++ b/tests/t360-basis.c @@ -0,0 +1,56 @@ +/// @file +/// Test interpolation ApplyAdd in multiple dimensions +/// \test Test interpolation ApplyAdd in multiple dimensions +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector u, u_q, v, v_q, w_q; + CeedBasis basis; + CeedInt p = 4, q = 5, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim); + + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedVectorCreate(ceed, q_dim, &u_q); + CeedVectorCreate(ceed, q_dim, &v_q); + CeedVectorCreate(ceed, q_dim, &w_q); + + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis); + + // Compute area + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q); + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q); + CeedVectorPointwiseMult(v_q, u_q, w_q); + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + // Double area computed + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + + // Check area + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < p_dim; i++) 
area += v_array[i]; + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&v_q); + CeedVectorDestroy(&w_q); + CeedBasisDestroy(&basis); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t361-basis.c b/tests/t361-basis.c new file mode 100644 index 0000000000..6671a39ae5 --- /dev/null +++ b/tests/t361-basis.c @@ -0,0 +1,116 @@ +/// @file +/// Test grad ApplyAdd in multiple dimensions +/// \test Test grad ApplyAdd in multiple dimensions +#include +#include +#include + +static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { + CeedScalar result = tanh(x[0] + 0.1); + if (dim > 1) result += atan(x[1] + 0.2); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); + return result; +} + +static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { + CeedScalar tol; + if (scalar_type == CEED_SCALAR_FP32) { + if (dim == 3) tol = 0.05; + else tol = 1.e-3; + } else { + tol = 1.e-10; + } + return 2.0 * tol; +} + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector x, x_q, u, u_q, ones, v; + CeedBasis basis_x_lobatto, basis_u_gauss; + CeedInt p = 8, q = 10, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim), x_dim = CeedIntPow(2, dim); + CeedScalar sum_1 = 0, sum_2 = 0; + + CeedVectorCreate(ceed, x_dim * dim, &x); + { + CeedScalar x_array[x_dim * dim]; + + for (CeedInt d = 0; d < dim; d++) { + for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 
1 : -1; + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, p_dim * dim, &x_q); + CeedVectorSetValue(x_q, 0); + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, q_dim * dim, &u_q); + CeedVectorSetValue(u_q, 0); + CeedVectorCreate(ceed, q_dim * dim, &ones); + CeedVectorSetValue(ones, 1); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorSetValue(v, 0); + + // Get function values at quadrature points + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x_lobatto); + CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_q); + + { + const CeedScalar *x_q_array; + CeedScalar u_array[p_dim]; + + CeedVectorGetArrayRead(x_q, CEED_MEM_HOST, &x_q_array); + for (CeedInt i = 0; i < p_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_q_array[d * p_dim + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_q, &x_q_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, u_array); + } + + // Calculate G u at quadrature points, G' * 1 at dofs + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u_gauss); + CeedBasisApply(basis_u_gauss, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, u_q); + CeedVectorScale(u_q, 2.0); + CeedBasisApply(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v); + CeedBasisApplyAdd(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v); + + // Check if 1' * G * u = u' * (G' * 1) + { + const CeedScalar *v_array, *u_array, *u_q_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(u_q, CEED_MEM_HOST, &u_q_array); + for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i]; + for (CeedInt i = 0; i < dim * q_dim; i++) sum_2 += u_q_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(u, &u_array); + CeedVectorRestoreArrayRead(u_q, 
&u_q_array); + } + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + + CeedScalar tol = GetTolerance(scalar_type, dim); + + if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %0.12f != %0.12f\n", dim, sum_1, sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_q); + CeedVectorDestroy(&u); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&ones); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis_x_lobatto); + CeedBasisDestroy(&basis_u_gauss); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t362-basis.c b/tests/t362-basis.c new file mode 100644 index 0000000000..bff1937d66 --- /dev/null +++ b/tests/t362-basis.c @@ -0,0 +1,59 @@ +/// @file +/// Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis +/// \test Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis +#include +#include +#include + +#include "t320-basis.h" + +// main test +int main(int argc, char **argv) { + Ceed ceed; + CeedVector u, v, u_q, v_q, w_q; + const CeedInt p = 6, q = 4, dim = 2; + CeedBasis basis; + CeedScalar q_ref[dim * q], q_weight[q]; + CeedScalar interp[p * q], grad[dim * p * q]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, p, &u); + CeedVectorCreate(ceed, p, &v); + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedVectorCreate(ceed, q, &u_q); + CeedVectorCreate(ceed, q, &v_q); + CeedVectorCreate(ceed, q, &w_q); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis); + + // Compute area + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q); + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q); + CeedVectorPointwiseMult(v_q, u_q, w_q); + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + // Double area computed + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + + // Check area + { + const CeedScalar *v_array; + 
CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < p; i++) area += v_array[i]; + if (fabs(area - 1.0) > 1E-6) printf("Incorrect area computed %f != %f\n", area, 1.0); + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&v_q); + CeedVectorDestroy(&w_q); + CeedBasisDestroy(&basis); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t363-basis.c b/tests/t363-basis.c new file mode 100644 index 0000000000..6c19f34027 --- /dev/null +++ b/tests/t363-basis.c @@ -0,0 +1,54 @@ +/// @file +/// Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis +/// \test Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis +#include +#include +#include + +#include "t320-basis.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedVector u, v; + const CeedInt p = 6, q = 4, dim = 2; + CeedBasis basis; + CeedScalar q_ref[dim * q], q_weight[q]; + CeedScalar interp[p * q], grad[dim * p * q]; + CeedScalar column_sum[p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, q * dim, &u); + CeedVectorSetValue(u, 1); + CeedVectorCreate(ceed, p, &v); + CeedVectorSetValue(v, 0); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis); + + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v); + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v); + + // Check values at quadrature points + for (int i = 0; i < p; i++) { + column_sum[i] = 0; + for (int j = 0; j < q * dim; j++) { + column_sum[i] += grad[i + j * p]; + } + } + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (int i = 0; i < p; i++) { + if (fabs(column_sum[i] - v_array[i] / 2.0) > 100. 
* CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, v_array[i] / 2.0, column_sum[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t364-basis.c b/tests/t364-basis.c new file mode 100644 index 0000000000..6ab4058d30 --- /dev/null +++ b/tests/t364-basis.c @@ -0,0 +1,98 @@ +/// @file +/// Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D +/// \test Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D +#include +#include +#include + +#define ALEN(a) (sizeof(a) / sizeof((a)[0])) + +static CeedScalar Eval(CeedScalar x, CeedInt n, const CeedScalar *c) { + CeedScalar y = c[n - 1]; + for (CeedInt i = n - 2; i >= 0; i--) y = y * x + c[i]; + return y; +} + +int main(int argc, char **argv) { + Ceed ceed; + CeedVector x, x_nodes, x_points, x_point, u, v, u_point, v_point; + CeedBasis basis_x, basis_u; + const CeedInt p = 5, q = 5, num_points = 4; + const CeedScalar c[4] = {1, 2, 3, 4}; // 1 + 2x + 3x^2 + ... 
+ + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, 2, &x); + CeedVectorCreate(ceed, p, &x_nodes); + CeedVectorCreate(ceed, num_points, &x_points); + CeedVectorCreate(ceed, 1, &x_point); + CeedVectorCreate(ceed, p, &u); + CeedVectorCreate(ceed, num_points, &v); + CeedVectorCreate(ceed, p, &u_point); + CeedVectorCreate(ceed, 1, &v_point); + CeedVectorSetValue(v_point, 1.0); + + // Get nodal coordinates + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, p, CEED_GAUSS_LOBATTO, &basis_x); + { + CeedScalar x_array[2]; + + for (CeedInt i = 0; i < 2; i++) x_array[i] = CeedIntPow(-1, i + 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes); + + // Set values of u at nodes + { + const CeedScalar *x_array; + CeedScalar u_array[p]; + + CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p; i++) u_array[i] = Eval(x_array[i], ALEN(c), c); + CeedVectorRestoreArrayRead(x_nodes, &x_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array); + } + + // Interpolate to arbitrary points + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + { + CeedScalar x_array[4] = {-0.33, -0.65, 0.16, 0.99}; + + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + + for (CeedInt i = 0; i < num_points; i++) { + const CeedInt num_point[1] = {1}; + CeedScalar fx = 0.0; + const CeedScalar *x_array, *u_array, *v_array, *u_point_array; + + CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array); + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorSetValue(x_point, x_array[i]); + CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); + // Double it + 
CeedBasisApplyAddAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); + CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array); + for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j]; + if (fabs(v_array[i] * 2.0 - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i] * 2.0, fx, x_array[i]); + CeedVectorRestoreArrayRead(u_point, &u_point_array); + CeedVectorRestoreArrayRead(x_points, &x_array); + CeedVectorRestoreArrayRead(u, &u_array); + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_nodes); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&x_point); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&u_point); + CeedVectorDestroy(&v_point); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t365-basis.c b/tests/t365-basis.c new file mode 100644 index 0000000000..74f93ce881 --- /dev/null +++ b/tests/t365-basis.c @@ -0,0 +1,123 @@ +/// @file +/// Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points +/// \test Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points +#include +#include +#include + +static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { + CeedScalar result = tanh(x[0] + 0.1); + if (dim > 1) result += atan(x[1] + 0.2); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); + return result; +} + +static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { + CeedScalar tol; + if (scalar_type == CEED_SCALAR_FP32) { + if (dim == 3) tol = 0.005; + else tol = 1.e-4; + } else { + tol = 1.e-11; + } + return tol; +} + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector x, x_nodes, x_points, u, u_points, v, ones; + CeedBasis basis_x, basis_u; + const CeedInt p = 9, q = 9, num_points = 4, x_dim = CeedIntPow(2, 
dim), p_dim = CeedIntPow(p, dim); + CeedScalar sum_1 = 0, sum_2 = 0; + + CeedVectorCreate(ceed, x_dim * dim, &x); + CeedVectorCreate(ceed, p_dim * dim, &x_nodes); + CeedVectorCreate(ceed, num_points * dim, &x_points); + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, num_points * dim, &u_points); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorCreate(ceed, num_points * dim, &ones); + + CeedVectorSetValue(ones, 1); + CeedVectorSetValue(v, 0); + + // Get nodal coordinates + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x); + { + CeedScalar x_array[x_dim * dim]; + + for (CeedInt d = 0; d < dim; d++) { + for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1; + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes); + + // Set values of u at nodes + { + const CeedScalar *x_array; + CeedScalar u_array[p_dim]; + + CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_dim + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_nodes, &x_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array); + } + + // Interpolate to arbitrary points + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + { + CeedScalar x_array[12] = {-0.33, -0.65, 0.16, 0.99, -0.65, 0.16, 0.99, -0.33, 0.16, 0.99, -0.33, -0.65}; + + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + + // Calculate G u at arbitrary points, G' * 1 at dofs + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + // Double it + 
CeedBasisApplyAddAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + { + const CeedScalar *u_array, *v_array, *u_points_array; + + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(u_points, CEED_MEM_HOST, &u_points_array); + for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i]; + for (CeedInt i = 0; i < num_points * dim; i++) sum_2 += u_points_array[i]; + CeedVectorRestoreArrayRead(u, &u_array); + CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(u_points, &u_points_array); + } + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + + CeedScalar tol = GetTolerance(scalar_type, dim); + + if (fabs(sum_1 - 2.0 * sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, 2.0 * sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_nodes); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&u); + CeedVectorDestroy(&u_points); + CeedVectorDestroy(&ones); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t400-qfunction.h b/tests/t400-qfunction.h index 1fb64842fd..740c7da030 100644 --- a/tests/t400-qfunction.h +++ b/tests/t400-qfunction.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *w = in[0]; diff --git a/tests/t401-qfunction.h b/tests/t401-qfunction.h index c61cdb8ac6..f91dae701c 100644 --- a/tests/t401-qfunction.h +++ b/tests/t401-qfunction.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *w = in[0]; diff --git a/tests/t402-qfunction-f.f90 b/tests/t402-qfunction-f.f90 index e87bcc4f2b..45cac092ce 100644 --- a/tests/t402-qfunction-f.f90 +++ b/tests/t402-qfunction-f.f90 @@ -49,6 +49,11 @@ program test & ctxdata,coffset,err) call ceedqfunctioncontextview(ctx,err) + call ceedqfunctionsetnumviewtabs(qf_mass,1,err) + call ceedqfunctionview(qf_mass,err) + call ceedqfunctioncontextsetnumviewtabs(ctx,1,err) + call ceedqfunctioncontextview(ctx,err) + call ceedqfunctiondestroy(qf_setup,err) call ceedqfunctiondestroy(qf_mass,err) call ceeddestroy(ceed,err) diff --git a/tests/t402-qfunction.c b/tests/t402-qfunction.c index 2f80666e15..6f24d492f0 100644 --- a/tests/t402-qfunction.c +++ b/tests/t402-qfunction.c @@ -34,6 +34,22 @@ int main(int argc, char **argv) { } CeedQFunctionContextView(ctx, stdout); + // Check tabs and CeedObject functionality + { + CeedQFunction qf_mass_copy = NULL; + CeedQFunctionContext ctx_copy = NULL; + + CeedQFunctionReferenceCopy(qf_mass, &qf_mass_copy); + CeedQFunctionSetNumViewTabs(qf_mass_copy, 1); + 
CeedObjectView((CeedObject)qf_mass_copy, stdout); + CeedObjectDestroy((CeedObject *)&qf_mass_copy); + + CeedQFunctionContextReferenceCopy(ctx, &ctx_copy); + CeedQFunctionContextSetNumViewTabs(ctx_copy, 1); + CeedObjectView((CeedObject)ctx_copy, stdout); + CeedObjectDestroy((CeedObject *)&ctx_copy); + } + CeedQFunctionDestroy(&qf_setup); CeedQFunctionDestroy(&qf_mass); CeedQFunctionContextDestroy(&ctx); diff --git a/tests/t405-qfunction.h b/tests/t405-qfunction.h index eaf261791f..0c356c943e 100644 --- a/tests/t405-qfunction.h +++ b/tests/t405-qfunction.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *w = in[0]; diff --git a/tests/t406-qfunction-helper.h b/tests/t406-qfunction-helper.h index 9db4901023..0410b00af6 100644 --- a/tests/t406-qfunction-helper.h +++ b/tests/t406-qfunction-helper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause @@ -10,11 +10,15 @@ # pragma once // clang-format on +// Note - ceed/types.h should be used over ceed.h #include // Test include path with "/./" #include "./t406-qfunction-scales.h" -CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return SCALE_TWO * x; } +// Test include via -I.... 
+#include -CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return SCALE_THREE * x; } +CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return FAKE_SYS_SCALE_ONE * SCALE_TWO * x; } + +CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return FAKE_SYS_SCALE_ONE * SCALE_THREE * x; } diff --git a/tests/t406-qfunction-scales.h b/tests/t406-qfunction-scales.h index cde93275ff..7dc42e93c6 100644 --- a/tests/t406-qfunction-scales.h +++ b/tests/t406-qfunction-scales.h @@ -3,7 +3,7 @@ // Testing # on first line // Note: #ifndef and #pragma once header guards both work -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause diff --git a/tests/t406-qfunction.c b/tests/t406-qfunction.c index 201c3782a4..d593f8f73d 100644 --- a/tests/t406-qfunction.c +++ b/tests/t406-qfunction.c @@ -18,6 +18,14 @@ int main(int argc, char **argv) { CeedScalar v_true[q]; CeedInit(argv[1], &ceed); + { + char file_path[2056] = __FILE__; + char *last_slash = strrchr(file_path, '/'); + + memcpy(&file_path[last_slash - file_path], "/test-include/", 15); + CeedAddJitSourceRoot(ceed, file_path); + CeedAddJitDefine(ceed, "COMPILER_DEFINED_SCALE=42"); + } CeedVectorCreate(ceed, q, &w); CeedVectorCreate(ceed, q, &u); @@ -64,9 +72,9 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < q; i++) { - if (fabs(5 * v_true[i] * sqrt(2.) - v_array[i]) > 1E3 * CEED_EPSILON) { + if (fabs(5 * COMPILER_DEFINED_SCALE * v_true[i] * sqrt(2.) 
- v_array[i]) > 5E3 * CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] v_true %f != v %f\n", i, 5 * v_true[i] * sqrt(2.), v_array[i]); + printf("[%" CeedInt_FMT "] v_true %f != v %f\n", i, 5 * COMPILER_DEFINED_SCALE * v_true[i] * sqrt(2.), v_array[i]); // LCOV_EXCL_STOP } } diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h index f4782f7029..617e6b7875 100644 --- a/tests/t406-qfunction.h +++ b/tests/t406-qfunction.h @@ -1,24 +1,33 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -// Note: intentionally testing strange spacing in '#include's +// Note: intentionally testing strange spacing in include's // clang-format off +// Note - ceed/types.h should be used over ceed.h #include -# include +// Note - system headers like math.h and std*.h should be guarded +#ifndef CEED_RUNNING_JIT_PASS +# include +#endif #include "t406-qfunction-helper.h" // Test duplicate includes of guarded files // Also test include path with "/../" #include "../tests/t406-qfunction-helper.h" // Also test include path with "/../../" -#include "../../libCEED/tests/t406-qfunction-helper.h" +#include "output/../../tests/t406-qfunction-helper.h" # include "t406-qfunction-scales.h" // clang-format on +// Extra define set via CeedAddJitDefine() during JiT +#ifndef CEED_RUNNING_JIT_PASS +#define COMPILER_DEFINED_SCALE 42 +#endif + CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *w = in[0]; CeedScalar *q_data = out[0]; @@ -32,7 +41,7 @@ CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, Ce const CeedScalar *q_data = in[0], *u = in[1]; CeedScalar *v = 
out[0]; for (CeedInt i = 0; i < Q; i++) { - v[i] = q_data[i] * (times_two(u[i]) + times_three(u[i])) * sqrt(1.0 * SCALE_TWO); + v[i] = q_data[i] * COMPILER_DEFINED_SCALE * (times_two(u[i]) + times_three(u[i])) * sqrt(1.0 * SCALE_TWO); } return 0; } diff --git a/tests/t409-qfunction.c b/tests/t409-qfunction.c index 44029cc797..5f17a5614a 100644 --- a/tests/t409-qfunction.c +++ b/tests/t409-qfunction.c @@ -74,6 +74,7 @@ int main(int argc, char **argv) { CeedQFunctionContextRestoreData(ctx, &ctx_data_new); is_writable = false; CeedQFunctionSetContextWritable(qf, is_writable); + { in[0] = u; out[0] = v; diff --git a/tests/t409-qfunction.h b/tests/t409-qfunction.h index 27e2c6585e..b2f59a9f80 100644 --- a/tests/t409-qfunction.h +++ b/tests/t409-qfunction.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { CeedScalar *scale = (CeedScalar *)ctx; diff --git a/tests/t413-qfunction-f.f90 b/tests/t413-qfunction-f.f90 index c6a43d7361..754d881a4d 100644 --- a/tests/t413-qfunction-f.f90 +++ b/tests/t413-qfunction-f.f90 @@ -15,6 +15,8 @@ program test call ceedqfunctionview(qf_setup,err) call ceedqfunctionview(qf_mass,err) + call ceedqfunctionsetnumviewtabs(qf_mass,1,err) + call ceedqfunctionview(qf_mass,err) call ceedqfunctiondestroy(qf_setup,err) call ceedqfunctiondestroy(qf_mass,err) diff --git a/tests/t413-qfunction.c b/tests/t413-qfunction.c index 690502ae76..aeecdd639f 100644 --- a/tests/t413-qfunction.c +++ b/tests/t413-qfunction.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { CeedQFunctionView(qf_setup, stdout); CeedQFunctionView(qf_mass, stdout); + CeedQFunctionSetNumViewTabs(qf_mass, 1); + CeedQFunctionView(qf_mass, stdout); CeedQFunctionDestroy(&qf_setup); CeedQFunctionDestroy(&qf_mass); diff --git a/tests/t500-operator.h b/tests/t500-operator.h index de9ca8966a..935d077208 100644 --- a/tests/t500-operator.h +++ b/tests/t500-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *dxdX = in[1]; diff --git a/tests/t502-operator.h b/tests/t502-operator.h index 9d343b5ab9..fab809d8db 100644 --- a/tests/t502-operator.h +++ b/tests/t502-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *dxdX = in[1]; diff --git a/tests/t504-operator-f.f90 b/tests/t504-operator-f.f90 index ccc87614bc..beedfda264 100644 --- a/tests/t504-operator-f.f90 +++ b/tests/t504-operator-f.f90 @@ -92,7 +92,10 @@ program test call ceedoperatorsetfield(op_mass,'v',erestrictu,bu,& & ceed_vector_active,err) + call ceedoperatorsetname(op_setup,'setup',err) call ceedoperatorview(op_setup,err) + call ceedoperatorsetname(op_mass,'mass',err) + call ceedoperatorsetnumviewtabs(op_mass,1,err) call ceedoperatorview(op_mass,err) call ceedvectordestroy(qdata,err) diff --git a/tests/t504-operator.c b/tests/t504-operator.c index ce5e7bb0c2..41dfcc7962 100644 --- a/tests/t504-operator.c +++ b/tests/t504-operator.c @@ -66,8 +66,21 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_setup, "setup"); + CeedOperatorViewTerse(op_setup, stdout); 
CeedOperatorView(op_setup, stdout); - CeedOperatorView(op_mass, stdout); + + // Check tabs and CeedObject functionality + { + CeedOperator op_mass_copy = NULL; + + CeedOperatorReferenceCopy(op_mass, &op_mass_copy); + CeedOperatorSetName(op_mass_copy, "mass"); + CeedOperatorSetNumViewTabs(op_mass_copy, 1); + CeedOperatorViewTerse(op_mass_copy, stdout); + CeedObjectView((CeedObject)op_mass_copy, stdout); + CeedObjectDestroy((CeedObject *)&op_mass_copy); + } CeedVectorDestroy(&q_data); CeedElemRestrictionDestroy(&elem_restriction_u); diff --git a/tests/t507-operator.h b/tests/t507-operator.h index 5d245534be..312500b35f 100644 --- a/tests/t507-operator.h +++ b/tests/t507-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *dxdX = in[1]; diff --git a/tests/t510-operator.h b/tests/t510-operator.h index 01cf47450c..171f9d01df 100644 --- a/tests/t510-operator.h +++ b/tests/t510-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; diff --git a/tests/t520-operator-f.f90 b/tests/t520-operator-f.f90 index 628ce3735c..3882ddc1e0 100644 --- a/tests/t520-operator-f.f90 +++ b/tests/t520-operator-f.f90 @@ -211,13 +211,13 @@ program test & buhex,ceed_vector_active,err) ! Composite Operators - call ceedcompositeoperatorcreate(ceed,op_setup,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err) + call ceedoperatorcreatecomposite(ceed,op_setup,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err) - call ceedcompositeoperatorcreate(ceed,op_mass,err) - call ceedcompositeoperatoraddsub(op_mass,op_masstet,err) - call ceedcompositeoperatoraddsub(op_mass,op_masshex,err) + call ceedoperatorcreatecomposite(ceed,op_mass,err) + call ceedoperatorcompositeaddsub(op_mass,op_masstet,err) + call ceedoperatorcompositeaddsub(op_mass,op_masshex,err) ! 
Apply Setup Operator call ceedoperatorapply(op_setup,x,ceed_vector_none,& diff --git a/tests/t520-operator.c b/tests/t520-operator.c index 9035234ffa..5632b3e2d6 100644 --- a/tests/t520-operator.c +++ b/tests/t520-operator.c @@ -111,6 +111,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_mass_tet, "mass tet"); // Set up Hex Elements // -- Restrictions @@ -154,19 +155,30 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_mass_hex, "mass hex"); // Set up Composite Operators // -- Create - CeedCompositeOperatorCreate(ceed, &op_setup); + CeedOperatorCreateComposite(ceed, &op_setup); // -- Add SubOperators - CeedCompositeOperatorAddSub(op_setup, op_setup_tet); - CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + CeedOperatorCompositeAddSub(op_setup, op_setup_tet); + CeedOperatorCompositeAddSub(op_setup, op_setup_hex); // -- Create - CeedCompositeOperatorCreate(ceed, &op_mass); + CeedOperatorCreateComposite(ceed, &op_mass); // -- Add SubOperators - CeedCompositeOperatorAddSub(op_mass, op_mass_tet); - CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + CeedOperatorCompositeAddSub(op_mass, op_mass_tet); + CeedOperatorCompositeAddSub(op_mass, op_mass_hex); + + { // Test CeedOperatorCompositeGetSubByName + CeedOperator op_byname; + + CeedOperatorCompositeGetSubByName(op_mass, "mass hex", &op_byname); + if (op_byname != op_mass_hex) 
printf("CeedOperatorCompositeGetSubByName returned incorrect Sub Operator"); + + CeedOperatorCompositeGetSubByName(op_mass, "asdf", &op_byname); + if (op_byname != NULL) printf("CeedOperatorCompositeGetSubByName returned non-NULL for non-existent Sub Operator"); + } // Apply Setup Operator CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t521-operator-f.f90 b/tests/t521-operator-f.f90 index 20ab09eb7b..73fff92d7a 100644 --- a/tests/t521-operator-f.f90 +++ b/tests/t521-operator-f.f90 @@ -213,13 +213,13 @@ program test & buhex,ceed_vector_active,err) ! Composite Operators - call ceedcompositeoperatorcreate(ceed,op_setup,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err) + call ceedoperatorcreatecomposite(ceed,op_setup,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err) - call ceedcompositeoperatorcreate(ceed,op_mass,err) - call ceedcompositeoperatoraddsub(op_mass,op_masstet,err) - call ceedcompositeoperatoraddsub(op_mass,op_masshex,err) + call ceedoperatorcreatecomposite(ceed,op_mass,err) + call ceedoperatorcompositeaddsub(op_mass,op_masstet,err) + call ceedoperatorcompositeaddsub(op_mass,op_masshex,err) ! 
Apply Setup Operator call ceedoperatorapply(op_setup,x,ceed_vector_none,& diff --git a/tests/t521-operator.c b/tests/t521-operator.c index 1fff943186..dd13ea5589 100644 --- a/tests/t521-operator.c +++ b/tests/t521-operator.c @@ -156,13 +156,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); // Composite Operators - CeedCompositeOperatorCreate(ceed, &op_setup); - CeedCompositeOperatorAddSub(op_setup, op_setup_tet); - CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + CeedOperatorCreateComposite(ceed, &op_setup); + CeedOperatorCompositeAddSub(op_setup, op_setup_tet); + CeedOperatorCompositeAddSub(op_setup, op_setup_hex); - CeedCompositeOperatorCreate(ceed, &op_mass); - CeedCompositeOperatorAddSub(op_mass, op_mass_tet); - CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + CeedOperatorCreateComposite(ceed, &op_mass); + CeedOperatorCompositeAddSub(op_mass, op_mass_tet); + CeedOperatorCompositeAddSub(op_mass, op_mass_hex); // Apply Setup Operator CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t522-operator-f.f90 b/tests/t522-operator-f.f90 index 4ea3773f7b..98b9089edb 100644 --- a/tests/t522-operator-f.f90 +++ b/tests/t522-operator-f.f90 @@ -215,13 +215,13 @@ program test & buhex,ceed_vector_active,err) ! 
Composite Operators - call ceedcompositeoperatorcreate(ceed,op_setup,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err) + call ceedoperatorcreatecomposite(ceed,op_setup,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err) - call ceedcompositeoperatorcreate(ceed,op_diff,err) - call ceedcompositeoperatoraddsub(op_diff,op_difftet,err) - call ceedcompositeoperatoraddsub(op_diff,op_diffhex,err) + call ceedoperatorcreatecomposite(ceed,op_diff,err) + call ceedoperatorcompositeaddsub(op_diff,op_difftet,err) + call ceedoperatorcompositeaddsub(op_diff,op_diffhex,err) ! Apply Setup Operator call ceedoperatorapply(op_setup,x,ceed_vector_none,& diff --git a/tests/t522-operator.c b/tests/t522-operator.c index b2e1da90ac..8572c0d687 100644 --- a/tests/t522-operator.c +++ b/tests/t522-operator.c @@ -159,13 +159,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); // Composite Operators - CeedCompositeOperatorCreate(ceed, &op_setup); - CeedCompositeOperatorAddSub(op_setup, op_setup_tet); - CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + CeedOperatorCreateComposite(ceed, &op_setup); + CeedOperatorCompositeAddSub(op_setup, op_setup_tet); + CeedOperatorCompositeAddSub(op_setup, op_setup_hex); - CeedCompositeOperatorCreate(ceed, &op_diff); - CeedCompositeOperatorAddSub(op_diff, op_diff_tet); - CeedCompositeOperatorAddSub(op_diff, op_diff_hex); + CeedOperatorCreateComposite(ceed, &op_diff); + CeedOperatorCompositeAddSub(op_diff, op_diff_tet); + CeedOperatorCompositeAddSub(op_diff, op_diff_hex); // Apply Setup Operator CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t522-operator.h b/tests/t522-operator.h index 3f70b7d354..b594818bd1 100644 --- a/tests/t522-operator.h +++ b/tests/t522-operator.h @@ -1,11 
+1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *qw = in[0], *J = in[1]; diff --git a/tests/t523-operator-f.f90 b/tests/t523-operator-f.f90 index 0431b60ee7..fcd4504fe5 100644 --- a/tests/t523-operator-f.f90 +++ b/tests/t523-operator-f.f90 @@ -205,15 +205,16 @@ program test & buhex,ceed_vector_active,err) ! Composite Operators - call ceedcompositeoperatorcreate(ceed,op_setup,err) + call ceedoperatorcreatecomposite(ceed,op_setup,err) call ceedoperatorsetname(op_setup,'setup',err) - call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err) - call ceedcompositeoperatorcreate(ceed,op_mass,err) + call ceedoperatorcreatecomposite(ceed,op_mass,err) call ceedoperatorsetname(op_mass,'mass',err) - call ceedcompositeoperatoraddsub(op_mass,op_masstet,err) - call ceedcompositeoperatoraddsub(op_mass,op_masshex,err) + call ceedoperatorsetnumviewtabs(op_mass,1,err) + call ceedoperatorcompositeaddsub(op_mass,op_masstet,err) + call ceedoperatorcompositeaddsub(op_mass,op_masshex,err) ! 
View call ceedoperatorview(op_setup,err) diff --git a/tests/t523-operator.c b/tests/t523-operator.c index b882379516..a1e2307839 100644 --- a/tests/t523-operator.c +++ b/tests/t523-operator.c @@ -150,21 +150,24 @@ int main(int argc, char **argv) { // Set up Composite Operators // -- Create - CeedCompositeOperatorCreate(ceed, &op_setup); + CeedOperatorCreateComposite(ceed, &op_setup); CeedOperatorSetName(op_setup, "setup"); // -- Add SubOperators - CeedCompositeOperatorAddSub(op_setup, op_setup_tet); - CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + CeedOperatorCompositeAddSub(op_setup, op_setup_tet); + CeedOperatorCompositeAddSub(op_setup, op_setup_hex); // -- Create - CeedCompositeOperatorCreate(ceed, &op_mass); + CeedOperatorCreateComposite(ceed, &op_mass); CeedOperatorSetName(op_mass, "mass"); // -- Add SubOperators - CeedCompositeOperatorAddSub(op_mass, op_mass_tet); - CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + CeedOperatorCompositeAddSub(op_mass, op_mass_tet); + CeedOperatorCompositeAddSub(op_mass, op_mass_hex); // View + CeedOperatorViewTerse(op_setup, stdout); CeedOperatorView(op_setup, stdout); + CeedOperatorSetNumViewTabs(op_mass, 1); + CeedOperatorViewTerse(op_mass, stdout); CeedOperatorView(op_mass, stdout); // Cleanup diff --git a/tests/t524-operator-f.f90 b/tests/t524-operator-f.f90 index 4639442a5c..16b041c09a 100644 --- a/tests/t524-operator-f.f90 +++ b/tests/t524-operator-f.f90 @@ -215,13 +215,13 @@ program test & buhex,ceed_vector_active,err) ! 
Composite Operators - call ceedcompositeoperatorcreate(ceed,op_setup,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err) - call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err) + call ceedoperatorcreatecomposite(ceed,op_setup,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err) + call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err) - call ceedcompositeoperatorcreate(ceed,op_mass,err) - call ceedcompositeoperatoraddsub(op_mass,op_masstet,err) - call ceedcompositeoperatoraddsub(op_mass,op_masshex,err) + call ceedoperatorcreatecomposite(ceed,op_mass,err) + call ceedoperatorcompositeaddsub(op_mass,op_masstet,err) + call ceedoperatorcompositeaddsub(op_mass,op_masshex,err) ! Apply Setup Operator call ceedoperatorapply(op_setup,x,ceed_vector_none,& diff --git a/tests/t524-operator.c b/tests/t524-operator.c index fec0fe6ccd..3d61a563b3 100644 --- a/tests/t524-operator.c +++ b/tests/t524-operator.c @@ -155,13 +155,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); // Composite Operators - CeedCompositeOperatorCreate(ceed, &op_setup); - CeedCompositeOperatorAddSub(op_setup, op_setup_tet); - CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + CeedOperatorCreateComposite(ceed, &op_setup); + CeedOperatorCompositeAddSub(op_setup, op_setup_tet); + CeedOperatorCompositeAddSub(op_setup, op_setup_hex); - CeedCompositeOperatorCreate(ceed, &op_mass); - CeedCompositeOperatorAddSub(op_mass, op_mass_tet); - CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + CeedOperatorCreateComposite(ceed, &op_mass); + CeedOperatorCompositeAddSub(op_mass, op_mass_tet); + CeedOperatorCompositeAddSub(op_mass, op_mass_hex); // Apply Setup Operator CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t525-operator.c b/tests/t525-operator.c index 9d0d80442f..bed1365a77 100644 --- a/tests/t525-operator.c +++ b/tests/t525-operator.c @@ -73,9 
+73,9 @@ int main(int argc, char **argv) { CeedOperatorCreate(ceed, qf_sub_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_sub_2); // Composite operator - CeedCompositeOperatorCreate(ceed, &op_composite); - CeedCompositeOperatorAddSub(op_composite, op_sub_1); - CeedCompositeOperatorAddSub(op_composite, op_sub_2); + CeedOperatorCreateComposite(ceed, &op_composite); + CeedOperatorCompositeAddSub(op_composite, op_sub_1); + CeedOperatorCompositeAddSub(op_composite, op_sub_2); // Check setting field in context of single sub-operator for composite operator CeedOperatorGetContextFieldLabel(op_composite, "time", &time_label); @@ -112,6 +112,7 @@ int main(int argc, char **argv) { CeedOperatorGetContext(op_sub_1, &ctx_copy); if (ctx_copy != qf_ctx_sub_1) printf("Incorrect QFunctionContext retrieved"); + CeedQFunctionContextDestroy(&ctx_copy); CeedOperatorGetContext(op_sub_2, &ctx_copy); // Destroys reference to qf_ctx_sub_1 if (ctx_copy != qf_ctx_sub_2) printf("Incorrect QFunctionContext retrieved"); diff --git a/tests/t526-operator.c b/tests/t526-operator.c index 6d66590d15..8e68ab89b3 100644 --- a/tests/t526-operator.c +++ b/tests/t526-operator.c @@ -114,10 +114,10 @@ int main(int argc, char **argv) { // Set up Composite Operator // -- Create - CeedCompositeOperatorCreate(ceed, &op_mass); + CeedOperatorCreateComposite(ceed, &op_mass); // -- Add SubOperators - CeedCompositeOperatorAddSub(op_mass, op_mass_tet); - CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + CeedOperatorCompositeAddSub(op_mass, op_mass_tet); + CeedOperatorCompositeAddSub(op_mass, op_mass_hex); // Estimate FLOPs CeedQFunctionSetUserFlopsEstimate(qf_mass, 1); diff --git a/tests/t530-operator.c b/tests/t530-operator.c index d9d18083b0..60716e4544 100644 --- a/tests/t530-operator.c +++ b/tests/t530-operator.c @@ -94,12 +94,13 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array); CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array); - for 
(CeedInt i = 0; i < num_qpts; i++) + for (CeedInt i = 0; i < num_qpts; i++) { if (fabs(q_data_array[i] - assembled_array[i]) > 1e-9) { // LCOV_EXCL_START printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]); // LCOV_EXCL_STOP } + } CeedVectorRestoreArrayRead(qf_assembled, &assembled_array); CeedVectorRestoreArrayRead(q_data, &q_data_array); } diff --git a/tests/t530-operator.h b/tests/t530-operator.h index 01cf47450c..171f9d01df 100644 --- a/tests/t530-operator.h +++ b/tests/t530-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; diff --git a/tests/t531-operator-f.f90 b/tests/t531-operator-f.f90 index d78dd92351..a2deb4434d 100644 --- a/tests/t531-operator-f.f90 +++ b/tests/t531-operator-f.f90 @@ -15,7 +15,7 @@ program test integer bx,bu integer qf_setup,qf_diff,qf_diff_lin integer op_setup,op_diff,op_diff_lin - integer qdata,x,a,u,v + integer qdata,x,a,u,v,v_lin integer nelem,p,q,d integer row,col,offset parameter(nelem=6) @@ -28,8 +28,8 @@ program test parameter(ndofs=(nx*2+1)*(ny*2+1)) parameter(nqpts=nelem*q*q) integer indx(nelem*p*p) - real*8 arrx(d*ndofs),vv(ndofs) - integer*8 xoffset,voffset + real*8 arrx(d*ndofs),uu(ndofs),vv(ndofs),vvlin(ndofs) + integer*8 xoffset,uoffset,voffset,vlinoffset character arg*32 @@ -42,14 +42,26 @@ program test ! 
DoF Coordinates do i=0,nx*2 do j=0,ny*2 - arrx(i+j*(nx*2+1)+0*ndofs+1)=1.d0*i/(2*nx) - arrx(i+j*(nx*2+1)+1*ndofs+1)=1.d0*j/(2*ny) + arrx(i+j*(nx*2+1)+0*ndofs+1)=1.d0*i/(2*nx)+j*0.5 + arrx(i+j*(nx*2+1)+1*ndofs+1)=1.d0*j/(2*ny)+i*0.5 enddo enddo call ceedvectorcreate(ceed,d*ndofs,x,err) xoffset=0 call ceedvectorsetarray(x,ceed_mem_host,ceed_use_pointer,arrx,xoffset,err) +! Input, output arrays + do i=0,nx*2 + do j=0,ny*2 + uu(i+j*(nx*2+1)+1)=i*nx+j*ny + enddo + enddo + call ceedvectorcreate(ceed,ndofs,u,err) + uoffset=0 + call ceedvectorsetarray(u,ceed_mem_host,ceed_use_pointer,uu,uoffset,err) + call ceedvectorcreate(ceed,ndofs,v,err) + call ceedvectorcreate(ceed,ndofs,v_lin,err) + ! Qdata Vector call ceedvectorcreate(ceed,nqpts*d*(d+1)/2,qdata,err) @@ -125,23 +137,8 @@ program test & bu,ceed_vector_active,err) ! Apply original Poisson Operator - call ceedvectorcreate(ceed,ndofs,u,err) - call ceedvectorsetvalue(u,1.d0,err) - call ceedvectorcreate(ceed,ndofs,v,err) - call ceedvectorsetvalue(v,0.d0,err) call ceedoperatorapply(op_diff,u,v,ceed_request_immediate,err) -! Check Output - call ceedvectorgetarrayread(v,ceed_mem_host,vv,voffset,err) - do i=1,ndofs - if (abs(vv(voffset+i))>1.0d-14) then -! LCOV_EXCL_START - write(*,*) 'Error: Operator computed v[i] = ',vv(voffset+i),' != 0.0' -! LCOV_EXCL_STOP - endif - enddo - call ceedvectorrestorearrayread(v,vv,voffset,err) - ! Assemble QFunction call ceedoperatorlinearassembleqfunction(op_diff,a,erestrictlini,& & ceed_request_immediate,err) @@ -165,20 +162,21 @@ program test & bu,ceed_vector_active,err) ! Apply linearized Poisson Operator - call ceedvectorsetvalue(v,0.d0,err) - call ceedoperatorapply(op_diff_lin,u,v,ceed_request_immediate,err) + call ceedoperatorapply(op_diff_lin,u,v_lin,ceed_request_immediate,err) ! 
Check Output call ceedvectorgetarrayread(v,ceed_mem_host,vv,voffset,err) + call ceedvectorgetarrayread(v_lin,ceed_mem_host,vvlin,vlinoffset,err) do i=1,ndofs - if (abs(vv(voffset+i))>1.0d-14) then + if (abs(vv(voffset+i)-vvlin(vlinoffset+i))>1.0d-14) then ! LCOV_EXCL_START write(*,*) 'Error: Linearized operator computed v[i] = ',vv(voffset+i),& - & ' != 0.0' + & ' != ',vvlin(vlinoffset+i) ! LCOV_EXCL_STOP endif enddo call ceedvectorrestorearrayread(v,vv,voffset,err) + call ceedvectorrestorearrayread(v_lin,vvlin,vlinoffset,err) ! Cleanup call ceedqfunctiondestroy(qf_setup,err) @@ -198,6 +196,7 @@ program test call ceedvectordestroy(a,err) call ceedvectordestroy(u,err) call ceedvectordestroy(v,err) + call ceedvectordestroy(v_lin,err) call ceedvectordestroy(qdata,err) call ceeddestroy(ceed,err) end diff --git a/tests/t531-operator-f.h b/tests/t531-operator-f.h index 20f02ea332..590140632d 100644 --- a/tests/t531-operator-f.h +++ b/tests/t531-operator-f.h @@ -11,8 +11,8 @@ do i=1,q w=u2(i)/(u1(i+q*0)*u1(i+q*3)-u1(i+q*1)*u1(i+q*2)) v1(i+q*0)=w*(u1(i+q*2)*u1(i+q*2)+u1(i+q*3)*u1(i+q*3)) - v1(i+q*1)=-w*(u1(i+q*0)*u1(i+q*2)+u1(i+q*2)*u1(i+q*3)) - v1(i+q*2)=w*(u1(i+q*0)*u1(i+q*0)+u1(i+q*1)*u1(i+q*1)) + v1(i+q*1)=w*(u1(i+q*0)*u1(i+q*0)+u1(i+q*1)*u1(i+q*1)) + v1(i+q*2)=-w*(u1(i+q*0)*u1(i+q*2)+u1(i+q*2)*u1(i+q*3)) enddo ierr=0 diff --git a/tests/t531-operator.c b/tests/t531-operator.c index 9462d49323..b0e09caf8b 100644 --- a/tests/t531-operator.c +++ b/tests/t531-operator.c @@ -14,7 +14,7 @@ int main(int argc, char **argv) { CeedBasis basis_x, basis_u; CeedQFunction qf_setup, qf_diff, qf_diff_assembled; CeedOperator op_setup, op_diff, op_diff_assembled; - CeedVector q_data, x, assembled = NULL, u, v; + CeedVector q_data, x, assembled = NULL, u, v, v_assembled; CeedInt num_elem = 6, p = 3, q = 4, dim = 2; CeedInt nx = 3, ny = 2; CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * q * q; @@ -29,14 +29,26 @@ int main(int argc, char **argv) { for (CeedInt i = 0; 
i < nx * 2 + 1; i++) { for (CeedInt j = 0; j < ny * 2 + 1; j++) { - x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx); - x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny); + x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx) + 0.5 * j; + x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny) + 0.5 * i; } } CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } CeedVectorCreate(ceed, num_dofs, &u); + { + CeedScalar *u_array; + + CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); + for (CeedInt i = 0; i < nx * 2 + 1; i++) { + for (CeedInt j = 0; j < ny * 2 + 1; j++) { + u_array[i + j * (nx * 2 + 1)] = i * nx + j * ny; + } + } + CeedVectorRestoreArray(u, &u_array); + } CeedVectorCreate(ceed, num_dofs, &v); + CeedVectorCreate(ceed, num_dofs, &v_assembled); CeedVectorCreate(ceed, num_qpts * dim * (dim + 1) / 2, &q_data); // Restrictions @@ -88,20 +100,8 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); // Apply original Poisson Operator - CeedVectorSetValue(u, 1.0); CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE); - // Check output - { - const CeedScalar *v_array; - - CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); - for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 100. 
* CEED_EPSILON) printf("Error: Operator computed v[%" CeedInt_FMT "] = %f != 0.0\n", i, v_array[i]); - } - CeedVectorRestoreArrayRead(v, &v_array); - } - // Assemble QFunction CeedOperatorSetQFunctionAssemblyReuse(op_diff, true); CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_diff, &assembled, &elem_restriction_assembled, CEED_REQUEST_IMMEDIATE); @@ -122,18 +122,23 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff_assembled, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); // Apply new Poisson Operator - CeedVectorSetValue(v, 0.0); - CeedOperatorApply(op_diff_assembled, u, v, CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_diff_assembled, u, v_assembled, CEED_REQUEST_IMMEDIATE); // Check output { - const CeedScalar *v_array; + const CeedScalar *v_array, *v_assembled_array; CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(v_assembled, CEED_MEM_HOST, &v_assembled_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Error: Linearized operator computed v[i] = %f != 0.0\n", v_array[i]); + if (fabs(v_array[i] - v_assembled_array[i]) > 100. 
* CEED_EPSILON) { + // LCOV_EXCL_START + printf("Error: Linearized operator computed v[%d] = %f != %f\n", i, v_assembled_array[i], v_array[i]); + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(v_assembled, &v_assembled_array); } // Cleanup @@ -142,6 +147,7 @@ int main(int argc, char **argv) { CeedVectorDestroy(&q_data); CeedVectorDestroy(&u); CeedVectorDestroy(&v); + CeedVectorDestroy(&v_assembled); CeedElemRestrictionDestroy(&elem_restriction_u); CeedElemRestrictionDestroy(&elem_restriction_x); CeedElemRestrictionDestroy(&elem_restriction_q_data); diff --git a/tests/t531-operator.h b/tests/t531-operator.h index a9f69f6bd5..f1c3ccab25 100644 --- a/tests/t531-operator.h +++ b/tests/t531-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store @@ -28,8 +28,8 @@ CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, C const CeedScalar J22 = J[i + Q * 3]; const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12); qd[i + Q * 0] = w * (J12 * J12 + J22 * J22); - qd[i + Q * 2] = w * (J11 * J11 + J21 * J21); - qd[i + Q * 1] = -w * (J11 * J12 + J21 * J22); + qd[i + Q * 1] = w * (J11 * J11 + J21 * J21); + qd[i + Q * 2] = -w * (J11 * J12 + J21 * J22); } return 0; @@ -50,7 +50,6 @@ CEED_QFUNCTION(diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, Ce dv[i + Q * 0] = qd[i + Q * 0] * du0 + qd[i + Q * 2] * du1; dv[i + Q * 1] = qd[i + Q * 2] * du0 + qd[i + Q * 1] * du1; } - return 0; } diff --git a/tests/t532-operator.h b/tests/t532-operator.h index e15e3aed19..b81f87dbc6 100644 --- a/tests/t532-operator.h +++ b/tests/t532-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *J = in[0], *weight = in[1]; diff --git a/tests/t533-operator.c b/tests/t533-operator.c index a01dabda4a..4ac1c523de 100644 --- a/tests/t533-operator.c +++ b/tests/t533-operator.c @@ -28,11 +28,12 @@ int main(int argc, char **argv) { { CeedScalar x_array[dim * num_dofs]; - for (CeedInt i = 0; i < nx * 2 + 1; i++) + for (CeedInt i = 0; i < nx * 2 + 1; i++) { for (CeedInt j = 0; j < ny * 2 + 1; j++) { x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx); x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny); } + } CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); } CeedVectorCreate(ceed, num_dofs, &u); @@ -45,8 +46,9 @@ int main(int argc, char **argv) { col = i % nx; row = i / nx; offset = col * (p - 1) + row * (nx * 2 + 1) * (p - 1); - for (CeedInt j = 0; j < p; j++) + for (CeedInt j = 0; j < p; j++) { for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (nx * 2 + 1) + j; + } } CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_u); @@ -89,7 +91,7 @@ int main(int argc, char **argv) { // Manually assemble diagonal CeedVectorSetValue(u, 0.0); - for (int i = 0; i < num_dofs; i++) { + for (CeedInt i = 0; i < num_dofs; i++) { CeedScalar *u_array; const CeedScalar *v_array; @@ -113,7 +115,7 @@ int main(int argc, char **argv) { const CeedScalar *assembled_array; CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); - for (int i = 0; i < num_dofs; i++) { + for (CeedInt i = 0; i < num_dofs; i++) { if (fabs(assembled_array[i] - 
assembled_true[i]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]); diff --git a/tests/t534-operator.h b/tests/t534-operator.h index 3fc4c58887..518481a070 100644 --- a/tests/t534-operator.h +++ b/tests/t534-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store diff --git a/tests/t535-operator.h b/tests/t535-operator.h index 7f6797608c..fc62a6ca0d 100644 --- a/tests/t535-operator.h +++ b/tests/t535-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *J = in[0], *weight = in[1]; diff --git a/tests/t537-operator.h b/tests/t537-operator.h index 80b2d22d73..f08c690d12 100644 --- a/tests/t537-operator.h +++ b/tests/t537-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; diff --git a/tests/t538-operator.c b/tests/t538-operator.c index 45e86ecdff..0e5267019c 100644 --- a/tests/t538-operator.c +++ b/tests/t538-operator.c @@ -104,9 +104,9 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); // Composite operator - CeedCompositeOperatorCreate(ceed, &op_apply); - CeedCompositeOperatorAddSub(op_apply, op_mass); - CeedCompositeOperatorAddSub(op_apply, op_diff); + CeedOperatorCreateComposite(ceed, &op_apply); + CeedOperatorCompositeAddSub(op_apply, op_mass); + CeedOperatorCompositeAddSub(op_apply, op_diff); // Assemble diagonal CeedVectorCreate(ceed, num_dofs, &assembled); diff --git a/tests/t539-operator.h b/tests/t539-operator.h index 3a4fda2475..65eaa85554 100644 --- a/tests/t539-operator.h +++ b/tests/t539-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(apply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u_0, shape [2, num_comp=2, Q] diff --git a/tests/t540-operator.h b/tests/t540-operator.h index 79f5006719..0259af529c 100644 --- a/tests/t540-operator.h +++ b/tests/t540-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *J = in[0], *weight = in[1]; diff --git a/tests/t541-operator.h b/tests/t541-operator.h index 7eaa675c97..a8a3424f78 100644 --- a/tests/t541-operator.h +++ b/tests/t541-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians with shape [2, nc=2, Q] diff --git a/tests/t554-operator.c b/tests/t554-operator.c index 0ca19605a9..d63c548696 100644 --- a/tests/t554-operator.c +++ b/tests/t554-operator.c @@ -33,10 +33,10 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, num_comp * num_dofs_u_fine, &v_fine); // Composite operators - CeedCompositeOperatorCreate(ceed, &op_mass_coarse); - CeedCompositeOperatorCreate(ceed, &op_mass_fine); - CeedCompositeOperatorCreate(ceed, &op_prolong); - CeedCompositeOperatorCreate(ceed, &op_restrict); + CeedOperatorCreateComposite(ceed, &op_mass_coarse); + CeedOperatorCreateComposite(ceed, &op_mass_fine); + CeedOperatorCreateComposite(ceed, &op_prolong); + CeedOperatorCreateComposite(ceed, &op_restrict); // Setup fine suboperators for (CeedInt i = 0; i < num_sub_ops; i++) { @@ -99,7 +99,7 @@ int main(int argc, char **argv) { CeedOperatorApply(sub_op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); // -- Composite operators - CeedCompositeOperatorAddSub(op_mass_fine, sub_op_mass_fine); + CeedOperatorCompositeAddSub(op_mass_fine, sub_op_mass_fine); // -- Cleanup CeedVectorDestroy(&q_data); @@ -116,7 +116,7 @@ int main(int argc, char **argv) { // Scale for suboperator multiplicity CeedVectorCreate(ceed, num_comp * num_dofs_u_fine, &p_mult_fine); - CeedCompositeOperatorGetMultiplicity(op_mass_fine, 0, NULL, p_mult_fine); + CeedOperatorCompositeGetMultiplicity(op_mass_fine, 0, NULL, p_mult_fine); // Setup coarse and prolong/restriction suboperators for (CeedInt i = 0; i < num_sub_ops; i++) { @@ -125,7 +125,7 @@ int main(int argc, char **argv) { CeedOperator *sub_ops_mass_fine, sub_op_mass_coarse, sub_op_prolong, sub_op_restrict; // -- Fine grid operator - CeedCompositeOperatorGetSubList(op_mass_fine, 
&sub_ops_mass_fine); + CeedOperatorCompositeGetSubList(op_mass_fine, &sub_ops_mass_fine); // -- Restrictions CeedInt offset = num_elem_sub * i * (p_coarse - 1); @@ -145,9 +145,9 @@ int main(int argc, char **argv) { &sub_op_prolong, &sub_op_restrict); // -- Composite operators - CeedCompositeOperatorAddSub(op_mass_coarse, sub_op_mass_coarse); - CeedCompositeOperatorAddSub(op_prolong, sub_op_prolong); - CeedCompositeOperatorAddSub(op_restrict, sub_op_restrict); + CeedOperatorCompositeAddSub(op_mass_coarse, sub_op_mass_coarse); + CeedOperatorCompositeAddSub(op_prolong, sub_op_prolong); + CeedOperatorCompositeAddSub(op_restrict, sub_op_restrict); // -- Cleanup CeedElemRestrictionDestroy(&elem_restriction_u_coarse); diff --git a/tests/t565-operator.c b/tests/t565-operator.c index b5a542451f..8ed3e0ea5f 100644 --- a/tests/t565-operator.c +++ b/tests/t565-operator.c @@ -107,9 +107,9 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); // Composite operator - CeedCompositeOperatorCreate(ceed, &op_apply); - CeedCompositeOperatorAddSub(op_apply, op_mass); - CeedCompositeOperatorAddSub(op_apply, op_diff); + CeedOperatorCreateComposite(ceed, &op_apply); + CeedOperatorCompositeAddSub(op_apply, op_mass); + CeedOperatorCompositeAddSub(op_apply, op_diff); // Fully assemble operator CeedSize num_entries; diff --git a/tests/t566-operator.h b/tests/t566-operator.h index dfd0da43a2..c227b7d834 100644 --- a/tests/t566-operator.h +++ b/tests/t566-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; diff --git a/tests/t567-operator.h b/tests/t567-operator.h index 6b645272dc..997b6db1bb 100644 --- a/tests/t567-operator.h +++ b/tests/t567-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *w = in[0], (*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[1]; diff --git a/tests/t568-operator.h b/tests/t568-operator.h index d52bc2d800..6c38bb04c7 100644 --- a/tests/t568-operator.h +++ b/tests/t568-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
// // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store diff --git a/tests/t580-operator.h b/tests/t580-operator.h index 940a3605fc..cb7e472fba 100644 --- a/tests/t580-operator.h +++ b/tests/t580-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include // Compute det(A) CEED_QFUNCTION_HELPER CeedScalar MatDet2x2(const CeedScalar A[2][2]) { return A[0][0] * A[1][1] - A[1][0] * A[0][1]; } diff --git a/tests/t590-operator.h b/tests/t590-operator.h index a2018718f8..c50595bc26 100644 --- a/tests/t590-operator.h +++ b/tests/t590-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *u = in[0]; diff --git a/tests/t591-operator.h b/tests/t591-operator.h index 1c64f1181f..0a834e5056 100644 --- a/tests/t591-operator.h +++ b/tests/t591-operator.h @@ -1,11 +1,11 @@ -// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. // // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed -#include +#include CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0]; diff --git a/tests/t592-operator.c b/tests/t592-operator.c index e0ccdace2e..1650e0fa89 100644 --- a/tests/t592-operator.c +++ b/tests/t592-operator.c @@ -1,6 +1,6 @@ /// @file /// Test assembly of mass matrix operator QFunction at points -/// \test Test assembly of mass matrix operator QFunction +/// \test Test assembly of mass matrix operator QFunction at points #include #include #include @@ -173,12 +173,13 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array); CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array); - for (CeedInt i = 0; i < num_points; i++) + for (CeedInt i = 0; i < num_points; i++) { if (fabs(q_data_array[i] - assembled_array[i]) > 1e-9) { // LCOV_EXCL_START printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]); // LCOV_EXCL_STOP } + } CeedVectorRestoreArrayRead(qf_assembled, &assembled_array); CeedVectorRestoreArrayRead(q_data, &q_data_array); } diff --git a/tests/t593-operator.c b/tests/t593-operator.c index 5b145d0884..2e0710c7fc 100644 --- a/tests/t593-operator.c +++ b/tests/t593-operator.c @@ -1,5 +1,5 @@ /// @file -/// Bug reproducer for memcheck backends at points +/// Test 1D mass matrix operator at points with heterogeneous points per element /// \test Test 1D mass matrix operator at points with heterogeneous points per element #include #include @@ -85,13 +85,13 @@ int main(int argc, char **argv) { // Setup geometric scaling 
CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); - CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); - CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); diff --git a/tests/t594-operator.c b/tests/t594-operator.c new file mode 100644 index 0000000000..49405e37a4 --- /dev/null +++ b/tests/t594-operator.c @@ -0,0 +1,179 @@ +/// @file +/// Test diagonal assembly of mass matrix operator at points +/// \test Test diagonal assembly of mass matrix operator at points +#include +#include +#include +#include + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem = 3, dim = 1, p = 3, q = 5; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points]; + CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points], assembled_true[num_nodes_u]; + CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL, assembled = NULL; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + 
bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Mesh coordinates + for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, num_nodes_x, &x_elem); + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh); + + // U mesh + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + // Point reference coordinates + { + CeedScalar weight_tmp[num_points_per_elem + 1]; + CeedInt current_index = 0; + + // Use num_points_per_elem + 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp); + ind_x_points[0] = num_elem + 1; + for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + // Use num_points_per_elem for middle elements + for (CeedInt e = 1; e < num_elem - 1; e++) { + CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp); + ind_x_points[e] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + } + // Use num_points_per_elem - 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp); + ind_x_points[num_elem - 1] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + ind_x_points[num_elem] = num_elem + 1 + 
current_index; + + CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points); + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_q_data); + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + } + + // Basis creation + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, 
CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorSetValue(u, 0.0); + CeedVectorCreate(ceed, num_nodes_u, &v); + + // Assemble diagonal + CeedVectorCreate(ceed, num_nodes_u, &assembled); + CeedOperatorLinearAssembleDiagonal(op_mass, assembled, CEED_REQUEST_IMMEDIATE); + + // Manually assemble diagonal + CeedVectorSetValue(u, 0.0); + for (CeedInt i = 0; i < num_nodes_u; i++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[i] = 1.0; + if (i) u_array[i - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute diag entry for DoF i + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Retrieve entry + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + assembled_true[i] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + if (fabs(assembled_array[i] - assembled_true[i]) > 100. 
* CEED_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Cleanup + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t595-operator.c b/tests/t595-operator.c new file mode 100644 index 0000000000..e874ccb2ba --- /dev/null +++ b/tests/t595-operator.c @@ -0,0 +1,125 @@ +/// @file +/// Test FLOP estimation for mass matrix operator at points +/// \test Test FLOP estimation for mass matrix operator at points +#include "t595-operator.h" + +#include +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5; + CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedSize flop_estimate = 0; + CeedVector x_points, q_data; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_mass; + CeedOperator op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for 
(CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u); + } + CeedBasisCreateTensorH1Lagrange(ceed, 
dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + // Estimate FLOPs + CeedQFunctionSetUserFlopsEstimate(qf_mass, 1); + CeedOperatorGetFlopsEstimate(op_mass, &flop_estimate); + + // Check output + if (flop_estimate != 16317) { + // LCOV_EXCL_START + printf("Incorrect FLOP estimate computed, %ld != 16317\n", flop_estimate); + // LCOV_EXCL_STOP + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t595-operator.h b/tests/t595-operator.h new file mode 100644 index 0000000000..a5ddb3b9d7 --- /dev/null +++ b/tests/t595-operator.h @@ -0,0 +1,17 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + const CeedScalar *u = in[0], *rho = in[1]; + CeedScalar *v = out[0]; + + // Quadrature point loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = rho[i] * u[i]; } + return 0; +} diff --git a/tests/t596-operator.c b/tests/t596-operator.c new file mode 100644 index 0000000000..81ca865ebd --- /dev/null +++ b/tests/t596-operator.c @@ -0,0 +1,202 @@ +/// @file +/// Test full assembly of mass matrix operator +/// \test Test full assembly of mass matrix operator AtPoints +#include +#include +#include +#include + +#include "t596-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { + CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, x_points, u, v; + CeedInt p = 3, q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * p * p]; + CeedScalar assembled_values[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + + // Points + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_x * 2 + 1; i++) { + for (CeedInt j = 0; j < n_y * 2 + 1; j++) { + x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x); + x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_dofs, &u); + CeedVectorCreate(ceed, num_comp * num_dofs, &v); + CeedVectorCreate(ceed, num_points, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt col, row, offset; + col = i % n_x; + row = i / n_x; + offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1); + for (CeedInt j = 0; j < p; j++) { + for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, + &elem_restriction_u); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x); + 
CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP); + { + CeedQFunctionContext qf_context; + + CeedQFunctionContextCreate(ceed, &qf_context); + CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp); + CeedQFunctionSetContext(qf_mass, qf_context); + CeedQFunctionContextDestroy(&qf_context); + } + + // Operators + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // Fully assemble operator + CeedSize num_entries; + 
CeedInt *rows; + CeedInt *cols; + CeedVector assembled; + + for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) { + assembled_values[k] = 0.0; + assembled_true[k] = 0.0; + } + CeedOperatorLinearAssembleSymbolic(op_mass, &num_entries, &rows, &cols); + CeedVectorCreate(ceed, num_entries, &assembled); + CeedOperatorLinearAssemble(op_mass, assembled); + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt k = 0; k < num_entries; k++) { + assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k]; + } + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Manually assemble operator + CeedVectorSetValue(u, 0.0); + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[j] = 1.0; + if (j) u_array[j - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute entries for column j + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. 
* CEED_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j], + assembled_true[i * num_dofs * num_comp + j]); + // LCOV_EXCL_STOP + } + } + } + + // Cleanup + free(rows); + free(cols); + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t596-operator.h b/tests/t596-operator.h new file mode 100644 index 0000000000..85dc60e259 --- /dev/null +++ b/tests/t596-operator.h @@ -0,0 +1,29 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + const CeedScalar *weight = in[0], *J = in[1]; + CeedScalar *rho = out[0]; + + for (CeedInt i = 0; i < Q; i++) { + rho[i] = weight[i] * (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]); + } + return 0; +} + +CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + CeedInt num_comp = *(CeedInt *)ctx; + const CeedScalar *rho = in[0], *u = in[1]; + CeedScalar *v = out[0]; + + for (CeedInt i = 0; i < Q; i++) { + for (CeedInt c = 0; c < num_comp; c++) v[i + c * Q] = rho[i] * c * u[i + c * Q]; + } + return 0; +} diff --git a/tests/t597-operator.c b/tests/t597-operator.c new file mode 100644 index 0000000000..25d6b3cf3f --- /dev/null +++ b/tests/t597-operator.c @@ -0,0 +1,203 @@ +/// @file +/// Test full assembly of Poisson operator AtPoints +/// \test Test full assembly of Poisson operator AtPoints +#include +#include +#include +#include + +#include "t597-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { + CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, x, x_points, u, v; + CeedInt p = 3, q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * p * p]; + CeedScalar assembled_values[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + + // Points + CeedVectorCreate(ceed, dim * 
num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim * (dim + 1) / 2, num_points * dim * (dim + 1) / 2, CEED_MEM_HOST, + CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_x * 2 + 1; i++) { + for (CeedInt j = 0; j < n_y * 2 + 1; j++) { + x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x); + x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_dofs, &u); + CeedVectorCreate(ceed, num_comp * num_dofs, &v); + CeedVectorCreate(ceed, num_points * dim * (dim + 1) / 2, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt col, row, offset; + col = i % n_x; + row = i / n_x; + offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1); + for (CeedInt j = 0; j < p; j++) { + for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j; + } + } + 
CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, + &elem_restriction_u); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u); + + // QFunction - setup + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + // Operator - setup + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // QFunction - apply + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff); + CeedQFunctionAddInput(qf_diff, "du", num_comp * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_diff, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_diff, "dv", num_comp * dim, CEED_EVAL_GRAD); + { + CeedQFunctionContext qf_context; + + CeedQFunctionContextCreate(ceed, &qf_context); + CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp); + CeedQFunctionSetContext(qf_diff, qf_context); + CeedQFunctionContextDestroy(&qf_context); + } + + // Operator - 
apply + CeedOperatorCreateAtPoints(ceed, qf_diff, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff); + CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); + + // Fully assemble operator + CeedSize num_entries; + CeedInt *rows; + CeedInt *cols; + CeedVector assembled; + + for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) { + assembled_values[k] = 0.0; + assembled_true[k] = 0.0; + } + CeedOperatorLinearAssembleSymbolic(op_diff, &num_entries, &rows, &cols); + CeedVectorCreate(ceed, num_entries, &assembled); + CeedOperatorLinearAssemble(op_diff, assembled); + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt k = 0; k < num_entries; k++) assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k]; + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Manually assemble operator + CeedVectorSetValue(u, 0.0); + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[j] = 1.0; + if (j) u_array[j - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute entries for column j + CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled_values[i * num_comp * num_dofs + j] 
- assembled_true[i * num_comp * num_dofs + j]) > 100. * CEED_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j], + assembled_true[i * num_comp * num_dofs + j]); + // LCOV_EXCL_STOP + } + } + } + + // Cleanup + free(rows); + free(cols); + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_diff); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_diff); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t597-operator.h b/tests/t597-operator.h new file mode 100644 index 0000000000..57b8e0dec6 --- /dev/null +++ b/tests/t597-operator.h @@ -0,0 +1,59 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store + // the symmetric part of the result. 
+ + // in[0] is Jacobians with shape [2, nc=2, Q] + // in[1] is quadrature weights, size (Q) + const CeedScalar *J = in[0], *qw = in[1]; + + // out[0] is qdata, size (3*Q) + CeedScalar *qd = out[0]; + + // Quadrature point loop + for (CeedInt i = 0; i < Q; i++) { + // J: 0 2 qd: 0 2 adj(J): J22 -J12 + // 1 3 2 1 -J21 J11 + const CeedScalar J11 = J[i + Q * 0]; + const CeedScalar J21 = J[i + Q * 1]; + const CeedScalar J12 = J[i + Q * 2]; + const CeedScalar J22 = J[i + Q * 3]; + const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12); + qd[i + Q * 0] = w * (J12 * J12 + J22 * J22); + qd[i + Q * 1] = w * (J11 * J11 + J21 * J21); + qd[i + Q * 2] = -w * (J11 * J12 + J21 * J22); + } + + return 0; +} + +CEED_QFUNCTION(diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + CeedInt num_comp = *(CeedInt *)ctx; + // in[0] is gradient u, shape [2, nc=num_comp, Q] + // in[1] is quadrature data, size (3*Q) + const CeedScalar *du = in[0], *qd = in[1]; + + // out[0] is output to multiply against gradient v, shape [2, nc=num_comp, Q] + CeedScalar *dv = out[0]; + + // Quadrature point loop + for (CeedInt i = 0; i < Q; i++) { + for (CeedInt c = 0; c < num_comp; c++) { + const CeedScalar du0 = du[i + Q * (2 * c + 0)]; + const CeedScalar du1 = du[i + Q * (2 * c + 1)]; + + dv[i + Q * (2 * c + 0)] = qd[i + Q * 0] * du0 + qd[i + Q * 2] * du1; + dv[i + Q * (2 * c + 1)] = qd[i + Q * 2] * du0 + qd[i + Q * 1] * du1; + } + } + + return 0; +} diff --git a/tests/t598-operator.c b/tests/t598-operator.c new file mode 100644 index 0000000000..55c7560fbb --- /dev/null +++ b/tests/t598-operator.c @@ -0,0 +1,279 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator AtPoints with multigrid level +/// \test Test creation, action, and destruction for mass matrix operator AtPoints with multigrid level +#include "t591-operator.h" + +#include +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p_coarse =
2, p_fine = 3, q = 5; + CeedInt num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt num_nodes_coarse = (num_elem_1d * (p_coarse - 1) + 1) * (num_elem_1d * (p_coarse - 1) + 1); + CeedInt num_nodes_fine = (num_elem_1d * (p_fine - 1) + 1) * (num_elem_1d * (p_fine - 1) + 1); + CeedVector x_points, x_elem, q_data, u_coarse, u_fine, v_coarse, v_fine, p_mult_fine; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u_coarse, elem_restriction_u_fine; + CeedBasis basis_x, basis_u_coarse, basis_u_fine; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates (interleaved per node: offsets are dim * node index, component stride 1) + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = dim * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] =
i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p_coarse * p_coarse]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_coarse - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p_coarse * p_coarse, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p_coarse - 1) + r_node % p_coarse) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p_coarse; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p_coarse * p_coarse, 1, 1, num_nodes_coarse, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, + &elem_restriction_u_coarse); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_coarse, q, CEED_GAUSS, &basis_u_coarse); + { + CeedInt ind_u[num_elem * p_fine * p_fine]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_fine - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p_fine * p_fine, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p_fine - 1) + r_node % p_fine) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p_fine; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, 
num_elem, p_fine * p_fine, 1, 1, num_nodes_fine, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, + &elem_restriction_u_fine); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_fine, q, CEED_GAUSS, &basis_u_fine); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_fine); + CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); + + CeedVectorCreate(ceed, num_nodes_fine, &u_fine); + CeedVectorCreate(ceed, num_nodes_fine, &v_fine); + CeedVectorCreate(ceed, num_nodes_fine, &p_mult_fine); + CeedVectorCreate(ceed, 
num_nodes_coarse, &u_coarse); + CeedVectorCreate(ceed, num_nodes_coarse, &v_coarse); + + // Create multigrid level + CeedVectorSetValue(p_mult_fine, 1.0); + CeedOperatorMultigridLevelCreate(op_mass_fine, p_mult_fine, elem_restriction_u_coarse, basis_u_coarse, &op_mass_coarse, &op_prolong, &op_restrict); + + // Coarse problem + CeedVectorSetValue(u_coarse, 1.0); + CeedOperatorApply(op_mass_coarse, u_coarse, v_coarse, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_coarse; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_coarse, &v_array); + if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, (double)num_elem); + } + + // Prolong coarse u + CeedOperatorApply(op_prolong, u_coarse, u_fine, CEED_REQUEST_IMMEDIATE); + + // Fine problem + CeedOperatorApply(op_mass_fine, u_fine, v_fine, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_fine, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_fine; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_fine, &v_array); + + if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, (double)num_elem); + } + // Restrict state to coarse grid + CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_coarse; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_coarse, &v_array); + if (fabs(sum - num_elem) > 1000.
* CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, (double)num_elem); + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&u_coarse); + CeedVectorDestroy(&u_fine); + CeedVectorDestroy(&v_fine); + CeedVectorDestroy(&v_coarse); + CeedVectorDestroy(&p_mult_fine); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_u_coarse); + CeedElemRestrictionDestroy(&elem_restriction_u_fine); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u_coarse); + CeedBasisDestroy(&basis_u_fine); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass_coarse); + CeedOperatorDestroy(&op_mass_fine); + CeedOperatorDestroy(&op_prolong); + CeedOperatorDestroy(&op_restrict); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t599-operator.c b/tests/t599-operator.c new file mode 100644 index 0000000000..a38d6b1f47 --- /dev/null +++ b/tests/t599-operator.c @@ -0,0 +1,148 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator at points using sequential composite operator +/// \test Test creation, action, and destruction for mass matrix operator at points using sequential composite operator +#include "t599-operator.h" + +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5; + CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedVector x_points, u, v, u_points; + CeedElemRestriction elem_restriction_x_points, elem_restriction_u_points, elem_restriction_u; + CeedBasis basis_u; + CeedQFunction qf_to_points, qf_from_points; + CeedOperator
op_to_points, op_from_points, op_mass; + bool is_at_points, is_sequential; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_u_points); + CeedElemRestrictionCreateVector(elem_restriction_u_points, &u_points, NULL); + CeedVectorSetValue(u_points, 0); + } + + { + CeedInt ind_u[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = g_node; + } + } + 
CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_to_points); + CeedQFunctionAddInput(qf_to_points, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_to_points, "u_points", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_to_points, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_points); + CeedOperatorSetField(op_to_points, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_to_points, "u_points", elem_restriction_u_points, CEED_BASIS_NONE, u_points); + CeedOperatorAtPointsSetPoints(op_to_points, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_to_points, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_from_points); + CeedQFunctionAddInput(qf_from_points, "u_points", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_from_points, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_from_points, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_from_points); + CeedOperatorSetField(op_from_points, "u_points", elem_restriction_u_points, CEED_BASIS_NONE, u_points); + CeedOperatorSetField(op_from_points, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_from_points, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_from_points, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedOperatorCreateComposite(ceed, &op_mass); + CeedOperatorCompositeSetSequential(op_mass, true); + CeedOperatorCompositeAddSub(op_mass, op_to_points); + CeedOperatorCompositeAddSub(op_mass, op_from_points); + + CeedVectorCreate(ceed, num_nodes, &u); + CeedVectorSetValue(u, 1.0); + CeedVectorCreate(ceed, num_nodes, 
&v); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + CeedOperatorCompositeIsSequential(op_mass, &is_sequential); + if (!is_sequential) printf("Error: Composite operator should be sequential\n"); + + { + CeedScalar sum = 0.0; + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + // Summing 9 reference elements, each 2x2 => 36 sq units area + if (fabs(sum - 4.0 * num_elem) > CEED_EPSILON * 5e3) { + // LCOV_EXCL_START + printf("Incorrect area computed, %g != %g (abs error %g)\n", sum, 4.0 * num_elem, fabs(sum - 4.0 * num_elem)); + // LCOV_EXCL_STOP + } + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&u_points); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_to_points); + CeedQFunctionDestroy(&qf_from_points); + CeedOperatorDestroy(&op_to_points); + CeedOperatorDestroy(&op_from_points); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t599-operator.h b/tests/t599-operator.h new file mode 100644 index 0000000000..c50595bc26 --- /dev/null +++ b/tests/t599-operator.h @@ -0,0 +1,16 @@ +// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + const CeedScalar *u = in[0]; + CeedScalar *v = out[0]; + + for (CeedInt i = 0; i < Q; i++) v[i] = u[i]; + return 0; +} diff --git a/tests/test-include/fake-sys-include.h b/tests/test-include/fake-sys-include.h new file mode 100644 index 0000000000..edb954cb54 --- /dev/null +++ b/tests/test-include/fake-sys-include.h @@ -0,0 +1,14 @@ +#define FAKE_SYS_SCALE_ONE 1 + +// Note - files included this way cannot transitively include any files CUDA/ROCm won't compile +// These are bad and need to be guarded +#ifndef CEED_RUNNING_JIT_PASS +#include +#include +#endif + +// These are ok +// Note - ceed/types.h should be used over ceed.h +// ceed.h is replaced with ceed/types.h during JiT +#include +#include